Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,19 @@ Notable changes to the project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/).

**NOTE:** The version numbers of this package roughly align to the latest
version of the CUDA API this package is built against This means that this
version of the CUDA API this package is built against. This means that this
package _DOES NOT_ follow the PVP, or indeed any sensible version scheme,
because NVIDIA are A-OK introducing breaking changes in minor updates.


## [0.13.0.0] - ???
### Added
* Support for CUDA-13

### Removed
* A number of fields from DeviceProperties, as they have been removed from
`cudaDeviceProp`. Use `Foreign.CUDA.Driver.Device.attribute` to query them.

## [0.12.8.0] - 2025-08-21
### Added
* Support for CUDA-12
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,13 @@ Here is an incomplete historical list of missing bindings. Pull requests welcome
- cuGraphMemAllocNodeGetParams
- cuGraphMemFreeNodeGetParams

### CUDA-12
### CUDA >= 12

A lot. PRs welcome.

- CUDA-12.3
- Edge data in the driver Graph API (`cuGraphAddDependencies_v2` etc.)


# Old compatibility notes

Expand Down
22 changes: 22 additions & 0 deletions cbits/stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

#include "cbits/stubs.h"
#include <string.h> // memset

#if CUDART_VERSION >= 7000
cudaError_t cudaLaunchKernel_simple(const void *func, unsigned int gridX, unsigned int gridY, unsigned int gridZ, unsigned int blockX, unsigned int blockY, unsigned int blockZ, void **args, size_t sharedMem, cudaStream_t stream)
Expand Down Expand Up @@ -196,7 +197,13 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev)

CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
{
#if CUDA_VERSION >= 13000
CUctxCreateParams params;
memset(&params, 0, sizeof params);
return cuCtxCreate_v4(pctx, &params, flags, dev);
#else
return cuCtxCreate_v2(pctx, flags, dev);
#endif
}

CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name)
Expand Down Expand Up @@ -424,3 +431,18 @@ CUresult CUDAAPI cuGraphExecKernelNodeSetParams_simple(CUgraphExec hGraphExec, C
}
#endif

#if CUDA_VERSION >= 13000
// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell.
CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device)
{
return cuMemAdvise(dptr, count, advice, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE});
}

// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell.
CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream)
{
// flags is reserved and must be 0 in CUDA 13
return cuMemPrefetchAsync(dptr, count, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}, 0, hStream);
}
#endif

5 changes: 5 additions & 0 deletions cbits/stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@ CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
#endif

#if CUDA_VERSION >= 13000
CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device);
CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream);
#endif

#ifdef __cplusplus
}
#endif
Expand Down
6 changes: 3 additions & 3 deletions cuda.cabal
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cabal-version: 1.24

Name: cuda
Version: 0.12.8.0
Version: 0.13.0.0
Synopsis: FFI binding to the CUDA interface for programming NVIDIA GPUs
Description:
The CUDA library provides a direct, general purpose C-like SPMD programming
Expand Down Expand Up @@ -30,7 +30,7 @@ Description:
.
* "Foreign.CUDA.Runtime"
.
Tested with library versions up to CUDA-12.8. See also the
Tested with library versions up to CUDA-13.0. See also the
<https://travis-ci.org/tmcdonell/cuda travis-ci.org> build matrix for
version compatibility.
.
Expand Down Expand Up @@ -177,6 +177,6 @@ source-repository head
source-repository this
type: git
location: https://github.com/tmcdonell/cuda
tag: v0.12.8.0
tag: v0.13.0.0

-- vim: nospell
71 changes: 55 additions & 16 deletions examples/src/deviceQueryDrv/DeviceQuery.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
module Main where

import Control.Monad
import Foreign.Marshal.Utils ( toBool )
import Numeric
import Prelude hiding ( (<>) )
import Text.PrettyPrint
Expand All @@ -15,6 +16,31 @@ import Foreign.CUDA.Analysis as CUDA
import qualified Foreign.CUDA.Driver as CUDA


-- In CUDA 13, a number of device properties became things only queryable using
-- cu(da)DeviceGetAttribute. This data type captures those.
$(if CUDA.libraryVersion >= 13000 then [d|
data AttrProperties = AttrProperties
{ clockRate :: !Int -- ^ Clock frequency in kilohertz
, memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz
, computeMode :: !ComputeMode
, kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels
, singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance
}
getAttrProperties :: Device -> IO AttrProperties
getAttrProperties d = do
clockRate <- CUDA.attribute d CUDA.ClockRate
memClockRate <- CUDA.attribute d CUDA.MemoryClockRate
computeMode <- toEnum <$> CUDA.attribute d CUDA.ComputeMode
kernelExecTimeoutEnabled <- toBool <$> CUDA.attribute d CUDA.KernelExecTimeout
singleToDoublePerfRatio <- CUDA.attribute d CUDA.SingleToDoublePrecisionPerfRatio
return AttrProperties{..}
|] else [d|
data AttrProperties = AttrProperties {}
getAttrProperties :: Device -> IO AttrProperties
getAttrProperties _ = return AttrProperties
|])


main :: IO ()
main = do
version <- CUDA.driverVersion
Expand All @@ -32,16 +58,17 @@ main = do
infos <- forM [0 .. numDevices-1] $ \n -> do
dev <- CUDA.device n
prp <- CUDA.props dev
return (n, dev, prp)
prp2 <- getAttrProperties dev
return (n, dev, prp, prp2)

forM_ infos $ \(n, dev, prp) -> do
forM_ infos $ \(n, dev, prp, prp2) -> do
p2p <- statP2P dev prp infos
printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp)
printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp prp2)
unless (null p2p) $ printf "%s\n" p2p


statDevice :: DeviceProperties -> String
statDevice dev@DeviceProperties{..} =
statDevice :: DeviceProperties -> AttrProperties -> String
statDevice dev@DeviceProperties{..} AttrProperties{..} =
let
DeviceResources{..} = deviceResources dev

Expand Down Expand Up @@ -69,22 +96,34 @@ statDevice dev@DeviceProperties{..} =
,(" 2D:", grid maxTextureDim2D)
,(" 3D:", cube maxTextureDim3D)
,("Texture alignment:", text $ showBytes textureAlignment)
,("Maximum memory pitch:", text $ showBytes memPitch)
,("Concurrent kernel execution:", bool concurrentKernels)
,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))
,("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled)
,("Maximum memory pitch:", text $ showBytes memPitch)]++

$(if CUDA.libraryVersion >= 13000 then [|
[("Concurrent copy and kernel execution:", bool concurrentKernels <> text (printf " with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))]
|] else [|
[("Concurrent kernel execution:", bool concurrentKernels)
,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))]
|])++

[("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled)
,("Integrated GPU sharing host memory:", bool integrated)
,("Host page-locked memory mapping:", bool canMapHostMemory)
,("ECC memory support:", bool eccEnabled)
,("Unified addressing (UVA):", bool unifiedAddressing)]++
#if __GLASGOW_HASKELL__ > 710

$(if CUDA.libraryVersion >= 8000 then [|
[("Single to double precision performance:", text $ printf "%d : 1" singleToDoublePerfRatio)
,("Supports compute pre-emption:", bool preemption)]|] else [|[]|])++
,("Supports compute pre-emption:", bool preemption)]
|] else [|[]|])++

$(if CUDA.libraryVersion >= 9000 then [|
[("Supports cooperative launch:", bool cooperativeLaunch)
,("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)]|] else [|[]|])++
#endif
[("Supports cooperative launch:", bool cooperativeLaunch)]
|] else [|[]|])++

$(if CUDA.libraryVersion >= 9000 && CUDA.libraryVersion < 13000 then [|
[("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)]
|] else [|[]|])++

[("PCI bus/location:", int (busID pciInfo) <> char '/' <> int (deviceID pciInfo))
,("Compute mode:", text (show computeMode))
]
Expand All @@ -94,7 +133,7 @@ statDevice dev@DeviceProperties{..} =
$ text (describe computeMode)


statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties)] -> IO String
statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties, AttrProperties)] -> IO String
statP2P dev prp infos
| CUDA.libraryVersion < 4000 = return []

Expand All @@ -103,7 +142,7 @@ statP2P dev prp infos
| otherwise
= let
go [] = return []
go ((m, peer, pp):is) =
go ((m, peer, pp, _):is) =
if dev == peer
then go is
else do
Expand Down
25 changes: 23 additions & 2 deletions src/Foreign/CUDA/Analysis/Device.chs
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,19 @@ import Debug.Trace
-- |
-- The compute mode the device is currently in
--
#if CUDA_VERSION < 13000
{# enum CUcomputemode as ComputeMode
{ underscoreToCase }
with prefix="CU_COMPUTEMODE" deriving (Eq, Show) #}
#else
{# enum cudaComputeMode as ComputeMode
{ }
with prefix="cudaComputeMode" deriving (Eq, Show) #}
#endif

instance Describe ComputeMode where
describe Default = "Multiple contexts are allowed on the device simultaneously"
#if CUDA_VERSION < 8000
#if CUDA_VERSION < 8000 || CUDA_VERSION >= 13000
describe Exclusive = "Only one context used by a single thread can be present on this device at a time"
#endif
describe Prohibited = "No contexts can be created on this device at this time"
Expand Down Expand Up @@ -69,7 +75,10 @@ cap a b = let a' = fromIntegral a in
--}

-- |
-- The properties of a compute device
-- The properties of a compute device, mirroring @struct cudaDeviceProp@ in CUDA.
--
-- In CUDA-13.0, a number of fields were removed from this struct and are now
-- only available by querying individual attributes on the device.
--
data DeviceProperties = DeviceProperties
{
Expand All @@ -91,16 +100,22 @@ data DeviceProperties = DeviceProperties
, maxTextureDim2D :: !(Int,Int)
, maxTextureDim3D :: !(Int,Int,Int)
#endif
#if CUDA_VERSION < 13000
, clockRate :: !Int -- ^ Clock frequency in kilohertz
#endif
, multiProcessorCount :: !Int -- ^ Number of multiprocessors on the device
, memPitch :: !Int64 -- ^ Maximum pitch in bytes allowed by memory copies
#if CUDA_VERSION >= 4000
, memBusWidth :: !Int -- ^ Global memory bus width in bits
#if CUDA_VERSION < 13000
, memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz
#endif
#endif
, textureAlignment :: !Int64 -- ^ Alignment requirement for textures
#if CUDA_VERSION < 13000
, computeMode :: !ComputeMode
, deviceOverlap :: !Bool -- ^ Device can concurrently copy memory and execute a kernel
#endif
#if CUDA_VERSION >= 3000
, concurrentKernels :: !Bool -- ^ Device can possibly execute multiple kernels concurrently
, eccEnabled :: !Bool -- ^ Device supports and has enabled error correction
Expand All @@ -111,7 +126,9 @@ data DeviceProperties = DeviceProperties
, pciInfo :: !PCI -- ^ PCI device information for the device
, tccDriverEnabled :: !Bool -- ^ Whether this is a Tesla device using the TCC driver
#endif
#if CUDA_VERSION < 13000
, kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels
#endif
, integrated :: !Bool -- ^ As opposed to discrete
, canMapHostMemory :: !Bool -- ^ Device can use pinned memory
#if CUDA_VERSION >= 4000
Expand All @@ -129,11 +146,15 @@ data DeviceProperties = DeviceProperties
#endif
#if CUDA_VERSION >= 8000
, preemption :: !Bool -- ^ Device supports compute pre-emption
#if CUDA_VERSION < 13000
, singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance
#endif
#endif
#if CUDA_VERSION >= 9000
, cooperativeLaunch :: !Bool -- ^ Device supports launching cooperative kernels
#if CUDA_VERSION < 13000
, cooperativeLaunchMultiDevice :: !Bool -- ^ Device can participate in cooperative multi-device kernels
#endif
#endif
}
deriving (Show)
Expand Down
22 changes: 21 additions & 1 deletion src/Foreign/CUDA/Driver/Device.chs
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,10 @@ attribute !d !a = cuDeviceGetAttribute a d
peekS s _ = peekCString s


-- | Returns a UUID for the device
-- | Returns a UUID for the device.
--
-- Since CUDA-13: If the device is in MIG mode, this function returns its MIG
-- UUID which uniquely identifies the subscribed MIG compute instance.
--
-- Requires CUDA-9.2
--
Expand All @@ -265,10 +268,17 @@ uuid !dev =
unpack ptr
where
{-# INLINE cuDeviceGetUuid #-}
#if CUDA_VERSION < 13000
{# fun unsafe cuDeviceGetUuid
{ `Ptr ()'
, useDevice `Device'
} -> `()' checkStatus*- #}
#else
{# fun unsafe cuDeviceGetUuid_v2 as cuDeviceGetUuid
{ `Ptr ()'
, useDevice `Device'
} -> `()' checkStatus*- #}
#endif

{-# INLINE unpack #-}
unpack :: Ptr () -> IO UUID
Expand Down Expand Up @@ -319,7 +329,9 @@ props !d = do
sharedMemPerBlock <- fromIntegral <$> attribute d SharedMemoryPerBlock
memPitch <- fromIntegral <$> attribute d MaxPitch
textureAlignment <- fromIntegral <$> attribute d TextureAlignment
#if CUDA_VERSION < 13000
clockRate <- attribute d ClockRate
#endif
warpSize <- attribute d WarpSize
regsPerBlock <- attribute d RegistersPerBlock
maxThreadsPerBlock <- attribute d MaxThreadsPerBlock
Expand All @@ -333,9 +345,11 @@ props !d = do
computeCapability <- capability d
totalGlobalMem <- totalMem d
multiProcessorCount <- attribute d MultiprocessorCount
#if CUDA_VERSION < 13000
computeMode <- toEnum <$> attribute d ComputeMode
deviceOverlap <- toBool <$> attribute d GpuOverlap
kernelExecTimeoutEnabled <- toBool <$> attribute d KernelExecTimeout
#endif
integrated <- toBool <$> attribute d Integrated
canMapHostMemory <- toBool <$> attribute d CanMapHostMemory
#if CUDA_VERSION >= 3000
Expand All @@ -350,7 +364,9 @@ props !d = do
cacheMemL2 <- attribute d L2CacheSize
maxThreadsPerMultiProcessor <- attribute d MaxThreadsPerMultiprocessor
memBusWidth <- attribute d GlobalMemoryBusWidth
#if CUDA_VERSION < 13000
memClockRate <- attribute d MemoryClockRate
#endif
pciInfo <- PCI <$> attribute d PciBusId <*> attribute d PciDeviceId <*> attribute d PciDomainId
unifiedAddressing <- toBool <$> attribute d UnifiedAddressing
tccDriverEnabled <- toBool <$> attribute d TccDriver
Expand All @@ -367,11 +383,15 @@ props !d = do
#endif
#if CUDA_VERSION >= 8000
preemption <- toBool <$> attribute d ComputePreemptionSupported
#if CUDA_VERSION < 13000
singleToDoublePerfRatio <- attribute d SingleToDoublePrecisionPerfRatio
#endif
#endif
#if CUDA_VERSION >= 9000
cooperativeLaunch <- toBool <$> attribute d CooperativeLaunch
#if CUDA_VERSION < 13000
cooperativeLaunchMultiDevice <- toBool <$> attribute d CooperativeMultiDeviceLaunch
#endif
#endif

return DeviceProperties{..}
Expand Down
Loading
Loading