tmcdonell · tomsmeding · Aug 13, 2025 · Sep 14, 2025 · Sep 14, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,11 +5,19 @@ Notable changes to the project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
 **NOTE:** The version numbers of this package roughly align to the latest
-version of the CUDA API this package is built against This means that this
+version of the CUDA API this package is built against. This means that this
 package _DOES NOT_ follow the PVP, or indeed any sensible version scheme,
 because NVIDIA are A-OK introducing breaking changes in minor updates.
 
 
+## [0.13.0.0] - ???
+### Added
+  * Support for CUDA-13
+
+### Removed
+  * A number of fields from DeviceProperties, as they have been removed from
+    `cudaDeviceProp`. Use `Foreign.CUDA.Driver.Device.attribute` to query them.
+
 ## [0.12.8.0] - 2025-08-21
 ### Added
   * Support for CUDA-12

diff --git a/README.md b/README.md
@@ -150,10 +150,13 @@ Here is an incomplete historical list of missing bindings. Pull requests welcome
 - cuGraphMemAllocNodeGetParams
 - cuGraphMemFreeNodeGetParams
 
-### CUDA-12
+### CUDA >= 12
 
 A lot. PRs welcome.
 
+- CUDA-12.3
+  - Edge data in the driver Graph API (`cuGraphAddDependencies_v2` etc.)
+
 
 # Old compatibility notes
 

diff --git a/cbits/stubs.c b/cbits/stubs.c
@@ -3,6 +3,7 @@
  */
 
 #include "cbits/stubs.h"
+#include <string.h>  // memset
 
 #if CUDART_VERSION >= 7000
 cudaError_t cudaLaunchKernel_simple(const void *func, unsigned int gridX, unsigned int gridY, unsigned int gridZ, unsigned int blockX, unsigned int blockY, unsigned int blockZ, void **args, size_t sharedMem, cudaStream_t stream)
@@ -196,7 +197,13 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev)
 
 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
 {
+#if CUDA_VERSION >= 13000
+    CUctxCreateParams params;
+    memset(&params, 0, sizeof params);
+    return cuCtxCreate_v4(pctx, &params, flags, dev);
+#else
     return cuCtxCreate_v2(pctx, flags, dev);
+#endif
 }
 
 CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name)
@@ -424,3 +431,18 @@ CUresult CUDAAPI cuGraphExecKernelNodeSetParams_simple(CUgraphExec hGraphExec, C
 }
 #endif
 
+#if CUDA_VERSION >= 13000
+// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell.
+CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device)
+{
+  return cuMemAdvise(dptr, count, advice, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE});
+}
+
+// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell.
+CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream)
+{
+  // flags is reserved and must be 0 in CUDA 13
+  return cuMemPrefetchAsync(dptr, count, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}, 0, hStream);
+}
+#endif
+
diff --git a/cbits/stubs.h b/cbits/stubs.h
@@ -184,6 +184,11 @@ CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
 CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
 #endif
 
+#if CUDA_VERSION >= 13000
+CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device);
+CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream);
+#endif
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/cuda.cabal b/cuda.cabal
@@ -1,7 +1,7 @@
 cabal-version:          1.24
 
 Name:                   cuda
-Version:                0.12.8.0
+Version:                0.13.0.0
 Synopsis:               FFI binding to the CUDA interface for programming NVIDIA GPUs
 Description:
     The CUDA library provides a direct, general purpose C-like SPMD programming
@@ -30,7 +30,7 @@ Description:
     .
     * "Foreign.CUDA.Runtime"
     .
-    Tested with library versions up to CUDA-12.8. See also the
+    Tested with library versions up to CUDA-13.0. See also the
     <https://travis-ci.org/tmcdonell/cuda travis-ci.org> build matrix for
     version compatibility.
     .
@@ -177,6 +177,6 @@ source-repository head
 source-repository this
     type:               git
     location:           https://github.com/tmcdonell/cuda
-    tag:                v0.12.8.0
+    tag:                v0.13.0.0
 
 -- vim: nospell
diff --git a/examples/src/deviceQueryDrv/DeviceQuery.hs b/examples/src/deviceQueryDrv/DeviceQuery.hs
@@ -5,6 +5,7 @@
 module Main where
 
 import Control.Monad
+import Foreign.Marshal.Utils                            ( toBool )
 import Numeric
 import Prelude                                          hiding ( (<>) )
 import Text.PrettyPrint
@@ -15,6 +16,31 @@ import Foreign.CUDA.Analysis                            as CUDA
 import qualified Foreign.CUDA.Driver                    as CUDA
 
 
+-- In CUDA 13, a number of device properties became things only queryable using
+-- cu(da)DeviceGetAttribute. This data type captures those.
+$(if CUDA.libraryVersion >= 13000 then [d|
+    data AttrProperties = AttrProperties
+      { clockRate                     :: !Int             -- ^ Clock frequency in kilohertz
+      , memClockRate                  :: !Int             -- ^ Peak memory clock frequency in kilohertz
+      , computeMode                   :: !ComputeMode
+      , kernelExecTimeoutEnabled      :: !Bool            -- ^ Whether there is a runtime limit on kernels
+      , singleToDoublePerfRatio       :: !Int             -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance
+      }
+    getAttrProperties :: Device -> IO AttrProperties
+    getAttrProperties d = do
+      clockRate <- CUDA.attribute d CUDA.ClockRate
+      memClockRate <- CUDA.attribute d CUDA.MemoryClockRate
+      computeMode <- toEnum <$> CUDA.attribute d CUDA.ComputeMode
+      kernelExecTimeoutEnabled <- toBool <$> CUDA.attribute d CUDA.KernelExecTimeout
+      singleToDoublePerfRatio <- CUDA.attribute d CUDA.SingleToDoublePrecisionPerfRatio
+      return AttrProperties{..}
+  |] else [d|
+    data AttrProperties = AttrProperties {}
+    getAttrProperties :: Device -> IO AttrProperties
+    getAttrProperties _ = return AttrProperties
+  |])
+
+
 main :: IO ()
 main = do
   version    <- CUDA.driverVersion
@@ -32,16 +58,17 @@ main = do
   infos   <- forM [0 .. numDevices-1] $ \n -> do
     dev <- CUDA.device n
     prp <- CUDA.props dev
-    return (n, dev, prp)
+    prp2 <- getAttrProperties dev
+    return (n, dev, prp, prp2)
 
-  forM_ infos $ \(n, dev, prp) -> do
+  forM_ infos $ \(n, dev, prp, prp2) -> do
     p2p <- statP2P dev prp infos
-    printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp)
+    printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp prp2)
     unless (null p2p) $ printf "%s\n" p2p
 
 
-statDevice :: DeviceProperties -> String
-statDevice dev@DeviceProperties{..} =
+statDevice :: DeviceProperties -> AttrProperties -> String
+statDevice dev@DeviceProperties{..} AttrProperties{..} =
   let
       DeviceResources{..} = deviceResources dev
 
@@ -69,22 +96,34 @@ statDevice dev@DeviceProperties{..} =
         ,("  2D:",                                      grid maxTextureDim2D)
         ,("  3D:",                                      cube maxTextureDim3D)
         ,("Texture alignment:",                         text $ showBytes textureAlignment)
-        ,("Maximum memory pitch:",                      text $ showBytes memPitch)
-        ,("Concurrent kernel execution:",               bool concurrentKernels)
-        ,("Concurrent copy and execution:",             bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))
-        ,("Runtime limit on kernel execution:",         bool kernelExecTimeoutEnabled)
+        ,("Maximum memory pitch:",                      text $ showBytes memPitch)]++
+
+        $(if CUDA.libraryVersion >= 13000 then [|
+        [("Concurrent copy and kernel execution:",      bool concurrentKernels <> text (printf " with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))]
+        |] else [|
+        [("Concurrent kernel execution:",               bool concurrentKernels)
+        ,("Concurrent copy and execution:",             bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))]
+        |])++
+
+        [("Runtime limit on kernel execution:",         bool kernelExecTimeoutEnabled)
         ,("Integrated GPU sharing host memory:",        bool integrated)
         ,("Host page-locked memory mapping:",           bool canMapHostMemory)
         ,("ECC memory support:",                        bool eccEnabled)
         ,("Unified addressing (UVA):",                  bool unifiedAddressing)]++
-#if __GLASGOW_HASKELL__ > 710
+
         $(if CUDA.libraryVersion >= 8000 then [|
         [("Single to double precision performance:",    text $ printf "%d : 1" singleToDoublePerfRatio)
-        ,("Supports compute pre-emption:",              bool preemption)]|] else [|[]|])++
+        ,("Supports compute pre-emption:",              bool preemption)]
+        |] else [|[]|])++
+
         $(if CUDA.libraryVersion >= 9000 then [|
-        [("Supports cooperative launch:",               bool cooperativeLaunch)
-        ,("Supports multi-device cooperative launch:",  bool cooperativeLaunchMultiDevice)]|] else [|[]|])++
-#endif
+        [("Supports cooperative launch:",               bool cooperativeLaunch)]
+        |] else [|[]|])++
+
+        $(if CUDA.libraryVersion >= 9000 && CUDA.libraryVersion < 13000 then [|
+        [("Supports multi-device cooperative launch:",  bool cooperativeLaunchMultiDevice)]
+        |] else [|[]|])++
+
         [("PCI bus/location:",                          int (busID pciInfo) <> char '/' <> int (deviceID pciInfo))
         ,("Compute mode:",                              text (show computeMode))
         ]
@@ -94,7 +133,7 @@ statDevice dev@DeviceProperties{..} =
     $ text (describe computeMode)
 
 
-statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties)] -> IO String
+statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties, AttrProperties)] -> IO String
 statP2P dev prp infos
   | CUDA.libraryVersion < 4000          = return []
 
@@ -103,7 +142,7 @@ statP2P dev prp infos
   | otherwise
   = let
         go []                 = return []
-        go ((m, peer, pp):is) =
+        go ((m, peer, pp, _):is) =
           if dev == peer
             then go is
             else do

diff --git a/src/Foreign/CUDA/Analysis/Device.chs b/src/Foreign/CUDA/Analysis/Device.chs
@@ -32,13 +32,19 @@ import Debug.Trace
 -- |
 -- The compute mode the device is currently in
 --
+#if CUDA_VERSION < 13000
 {# enum CUcomputemode as ComputeMode
     { underscoreToCase }
     with prefix="CU_COMPUTEMODE" deriving (Eq, Show) #}
+#else
+{# enum cudaComputeMode as ComputeMode
+    { }
+    with prefix="cudaComputeMode" deriving (Eq, Show) #}
+#endif
 
 instance Describe ComputeMode where
   describe Default          = "Multiple contexts are allowed on the device simultaneously"
-#if CUDA_VERSION < 8000
+#if CUDA_VERSION < 8000 || CUDA_VERSION >= 13000
   describe Exclusive        = "Only one context used by a single thread can be present on this device at a time"
 #endif
   describe Prohibited       = "No contexts can be created on this device at this time"
@@ -69,7 +75,10 @@ cap a b = let a' = fromIntegral a in
 --}
 
 -- |
--- The properties of a compute device
+-- The properties of a compute device, mirroring @struct cudaDeviceProp@ in CUDA.
+--
+-- In CUDA-13.0, a number of fields were removed from this struct and are now
+-- only available by querying individual attributes on the device.
 --
 data DeviceProperties = DeviceProperties
   {
@@ -91,16 +100,22 @@ data DeviceProperties = DeviceProperties
   , maxTextureDim2D               :: !(Int,Int)
   , maxTextureDim3D               :: !(Int,Int,Int)
 #endif
+#if CUDA_VERSION < 13000
   , clockRate                     :: !Int             -- ^ Clock frequency in kilohertz
+#endif
   , multiProcessorCount           :: !Int             -- ^ Number of multiprocessors on the device
   , memPitch                      :: !Int64           -- ^ Maximum pitch in bytes allowed by memory copies
 #if CUDA_VERSION >= 4000
   , memBusWidth                   :: !Int             -- ^ Global memory bus width in bits
+#if CUDA_VERSION < 13000
   , memClockRate                  :: !Int             -- ^ Peak memory clock frequency in kilohertz
+#endif
 #endif
   , textureAlignment              :: !Int64           -- ^ Alignment requirement for textures
+#if CUDA_VERSION < 13000
   , computeMode                   :: !ComputeMode
   , deviceOverlap                 :: !Bool            -- ^ Device can concurrently copy memory and execute a kernel
+#endif
 #if CUDA_VERSION >= 3000
   , concurrentKernels             :: !Bool            -- ^ Device can possibly execute multiple kernels concurrently
   , eccEnabled                    :: !Bool            -- ^ Device supports and has enabled error correction
@@ -111,7 +126,9 @@ data DeviceProperties = DeviceProperties
   , pciInfo                       :: !PCI             -- ^ PCI device information for the device
   , tccDriverEnabled              :: !Bool            -- ^ Whether this is a Tesla device using the TCC driver
 #endif
+#if CUDA_VERSION < 13000
   , kernelExecTimeoutEnabled      :: !Bool            -- ^ Whether there is a runtime limit on kernels
+#endif
   , integrated                    :: !Bool            -- ^ As opposed to discrete
   , canMapHostMemory              :: !Bool            -- ^ Device can use pinned memory
 #if CUDA_VERSION >= 4000
@@ -129,11 +146,15 @@ data DeviceProperties = DeviceProperties
 #endif
 #if CUDA_VERSION >= 8000
   , preemption                    :: !Bool            -- ^ Device supports compute pre-emption
+#if CUDA_VERSION < 13000
   , singleToDoublePerfRatio       :: !Int             -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance
 #endif
+#endif
 #if CUDA_VERSION >= 9000
   , cooperativeLaunch             :: !Bool            -- ^ Device supports launching cooperative kernels
+#if CUDA_VERSION < 13000
   , cooperativeLaunchMultiDevice  :: !Bool            -- ^ Device can participate in cooperative multi-device kernels
+#endif
 #endif
   }
   deriving (Show)

diff --git a/src/Foreign/CUDA/Driver/Device.chs b/src/Foreign/CUDA/Driver/Device.chs
@@ -246,7 +246,10 @@ attribute !d !a = cuDeviceGetAttribute a d
     peekS s _ = peekCString s
 
 
--- | Returns a UUID for the device
+-- | Returns a UUID for the device.
+--
+-- Since CUDA-13: If the device is in MIG mode, this function returns its MIG
+-- UUID which uniquely identifies the subscribed MIG compute instance.
 --
 -- Requires CUDA-9.2
 --
@@ -265,10 +268,17 @@ uuid !dev =
     unpack ptr
   where
     {-# INLINE cuDeviceGetUuid #-}
+#if CUDA_VERSION < 13000
     {# fun unsafe cuDeviceGetUuid
       {           `Ptr ()'
       , useDevice `Device'
       } -> `()' checkStatus*- #}
+#else
+    {# fun unsafe cuDeviceGetUuid_v2 as cuDeviceGetUuid
+      {           `Ptr ()'
+      , useDevice `Device'
+      } -> `()' checkStatus*- #}
+#endif
 
     {-# INLINE unpack #-}
     unpack :: Ptr () -> IO UUID
@@ -319,7 +329,9 @@ props !d = do
   sharedMemPerBlock     <- fromIntegral <$> attribute d SharedMemoryPerBlock
   memPitch              <- fromIntegral <$> attribute d MaxPitch
   textureAlignment      <- fromIntegral <$> attribute d TextureAlignment
+#if CUDA_VERSION < 13000
   clockRate             <- attribute d ClockRate
+#endif
   warpSize              <- attribute d WarpSize
   regsPerBlock          <- attribute d RegistersPerBlock
   maxThreadsPerBlock    <- attribute d MaxThreadsPerBlock
@@ -333,9 +345,11 @@ props !d = do
   computeCapability             <- capability d
   totalGlobalMem                <- totalMem d
   multiProcessorCount           <- attribute d MultiprocessorCount
+#if CUDA_VERSION < 13000
   computeMode                   <- toEnum <$> attribute d ComputeMode
   deviceOverlap                 <- toBool <$> attribute d GpuOverlap
   kernelExecTimeoutEnabled      <- toBool <$> attribute d KernelExecTimeout
+#endif
   integrated                    <- toBool <$> attribute d Integrated
   canMapHostMemory              <- toBool <$> attribute d CanMapHostMemory
 #if CUDA_VERSION >= 3000
@@ -350,7 +364,9 @@ props !d = do
   cacheMemL2                    <- attribute d L2CacheSize
   maxThreadsPerMultiProcessor   <- attribute d MaxThreadsPerMultiprocessor
   memBusWidth                   <- attribute d GlobalMemoryBusWidth
+#if CUDA_VERSION < 13000
   memClockRate                  <- attribute d MemoryClockRate
+#endif
   pciInfo                       <- PCI <$> attribute d PciBusId <*> attribute d PciDeviceId <*> attribute d PciDomainId
   unifiedAddressing             <- toBool <$> attribute d UnifiedAddressing
   tccDriverEnabled              <- toBool <$> attribute d TccDriver
@@ -367,11 +383,15 @@ props !d = do
 #endif
 #if CUDA_VERSION >= 8000
   preemption                    <- toBool <$> attribute d ComputePreemptionSupported
+#if CUDA_VERSION < 13000
   singleToDoublePerfRatio       <- attribute d SingleToDoublePrecisionPerfRatio
 #endif
+#endif
 #if CUDA_VERSION >= 9000
   cooperativeLaunch             <- toBool <$> attribute d CooperativeLaunch
+#if CUDA_VERSION < 13000
   cooperativeLaunchMultiDevice  <- toBool <$> attribute d CooperativeMultiDeviceLaunch
+#endif
 #endif
 
   return DeviceProperties{..}