Skip to content

Commit bdbff27

Browse files
authored
Merge pull request #80 from tomsmeding/cuda-12
Cuda 12 and related patches
2 parents 5bc08e3 + 26083f0 commit bdbff27

File tree

12 files changed

+214
-565
lines changed

12 files changed

+214
-565
lines changed

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,19 @@ package _DOES NOT_ follow the PVP, or indeed any sensible version scheme,
1010
because NVIDIA are A-OK introducing breaking changes in minor updates.
1111

1212

13+
## [0.12.8.0] - ???
14+
### Added
15+
* Support for CUDA-12
16+
- Thanks to @noahmartinwilliams on GitHub for helping out!
17+
18+
### Removed
19+
* The following modules have been deprecated for a long time, and have
20+
finally been removed in CUDA-12:
21+
- `Foreign.CUDA.Driver.Texture`
22+
- `Foreign.CUDA.Runtime.Texture`
23+
Support for Texture Objects (their replacement) is missing in these
24+
bindings so far. Contributions welcome.
25+
1326
## [0.11.0.1] - 2023-08-15
1427
### Fixed
1528
* Build fixes for GHC 9.2 .. 9.6

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,6 @@ An incomplete list of missing bindings. Pull requests welcome!
145145
- cuGraphMemAllocNodeGetParams
146146
- cuGraphMemFreeNodeGetParams
147147

148+
### CUDA-12
149+
150+
Many of the APIs added in CUDA-12 are not bound yet. PRs welcome.

cbits/stubs.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,6 @@ cudaError_t cudaConfigureCall_simple(unsigned int gridX, unsigned int gridY, uns
2222
}
2323
#endif
2424

25-
CUresult cuTexRefSetAddress2D_simple(CUtexref tex, CUarray_format format, unsigned int numChannels, CUdeviceptr dptr, size_t width, size_t height, size_t pitch)
26-
{
27-
CUDA_ARRAY_DESCRIPTOR desc;
28-
desc.Format = format;
29-
desc.NumChannels = numChannels;
30-
desc.Width = width;
31-
desc.Height = height;
32-
33-
return cuTexRefSetAddress2D(tex, &desc, dptr, pitch);
34-
}
35-
3625
CUresult cuMemcpy2DHtoD(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int dstXInBytes, unsigned int dstY, void* srcHost, unsigned int srcPitch, unsigned int srcXInBytes, unsigned int srcY, unsigned int widthInBytes, unsigned int height)
3726
{
3827
CUDA_MEMCPY2D desc;
@@ -284,11 +273,6 @@ CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N)
284273
{
285274
return cuMemsetD32_v2(dstDevice, ui, N);
286275
}
287-
288-
CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes)
289-
{
290-
return cuTexRefSetAddress_v2(ByteOffset, hTexRef, dptr, bytes);
291-
}
292276
#endif
293277

294278
#if CUDA_VERSION >= 4000

cuda.cabal

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
cabal-version: 1.24
22

33
Name: cuda
4-
Version: 0.11.0.1
4+
Version: 0.12.8.0
55
Synopsis: FFI binding to the CUDA interface for programming NVIDIA GPUs
66
Description:
77
The CUDA library provides a direct, general purpose C-like SPMD programming
@@ -121,7 +121,6 @@ Library
121121
Foreign.CUDA.Driver.Module.Query
122122
Foreign.CUDA.Driver.Profiler
123123
Foreign.CUDA.Driver.Stream
124-
Foreign.CUDA.Driver.Texture
125124
Foreign.CUDA.Driver.Unified
126125
Foreign.CUDA.Driver.Utils
127126

@@ -133,7 +132,6 @@ Library
133132
Foreign.CUDA.Runtime.Exec
134133
Foreign.CUDA.Runtime.Marshal
135134
Foreign.CUDA.Runtime.Stream
136-
Foreign.CUDA.Runtime.Texture
137135
Foreign.CUDA.Runtime.Utils
138136

139137
-- Extras
@@ -151,6 +149,7 @@ Library
151149
build-depends:
152150
base >= 4.7 && < 5
153151
, bytestring >= 0.10.4
152+
, containers
154153
, filepath >= 1.0
155154
, template-haskell
156155
, uuid-types >= 1.0

src/Foreign/CUDA/Analysis/Device.chs

Lines changed: 132 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,12 @@ module Foreign.CUDA.Analysis.Device (
1919

2020
#include "cbits/stubs.h"
2121

22+
import qualified Data.Set as Set
23+
import Data.Set (Set)
2224
import Data.Int
25+
import Data.IORef
2326
import Text.Show.Describe
27+
import System.IO.Unsafe
2428

2529
import Debug.Trace
2630

@@ -179,7 +183,17 @@ data DeviceResources = DeviceResources
179183
deviceResources :: DeviceProperties -> DeviceResources
180184
deviceResources = resources . computeCapability
181185
where
182-
-- This is mostly extracted from tables in the CUDA occupancy calculator.
186+
-- Sources:
187+
-- [1] https://github.com/NVIDIA/cuda-samples/blob/7b60178984e96bc09d066077d5455df71fee2a9f/Common/helper_cuda.h
188+
-- - for: coresPerMP (line 643 _ConvertSMVer2Cores)
189+
-- - for: architecture names (line 695 _ConvertSMVer2ArchName)
190+
-- [2] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
191+
-- - for: maxGridsPerDevice
192+
-- - archived here: https://web.archive.org/web/20250409220108/https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
193+
-- - reproduced here: https://en.wikipedia.org/w/index.php?title=CUDA&oldid=1285775690#Technical_specification (note: link to specific page version)
194+
-- [3] NVIDIA Nsight Compute
195+
-- - for: the other fields
196+
-- - left top "Start Activity" -> "Occupancy Calculator" -> "Launch"; tab "GPU Data"
183197
--
184198
resources compute = case compute of
185199
Compute 1 0 -> resources (Compute 1 1) -- Tesla G80
@@ -283,7 +297,7 @@ deviceResources = resources . computeCapability
283297
}
284298
Compute 5 2 -> (resources (Compute 5 0)) -- Maxwell GM20x
285299
{ sharedMemPerMP = 98304
286-
, maxRegPerBlock = 32768
300+
, maxRegPerBlock = 32768 -- value from [3], wrong in [2]?
287301
, warpAllocUnit = 2
288302
}
289303
Compute 5 3 -> (resources (Compute 5 0)) -- Maxwell GM20B
@@ -318,9 +332,15 @@ deviceResources = resources . computeCapability
318332
}
319333
Compute 6 2 -> (resources (Compute 6 0)) -- Pascal GP10B
320334
{ coresPerMP = 128
321-
, warpsPerMP = 128
322-
, threadBlocksPerMP = 4096
323-
, maxRegPerBlock = 32768
335+
-- Commit 4f75ea889c2ade2bd3eab377b51bb5bbd28bfbae changed warpsPerMP
336+
-- to 128, but [2] and [3] say 64 like CC 6.0; reverted back to 64 to
337+
-- match NVIDIA documentation.
338+
-- That commit also changed threadsPerMP (later mistakenly translated
339+
-- to threadBlocksPerMP in 9df19adec8efc9df761deab40cf04d27810d97d3)
340+
-- from 2048 to 4096, but again [2] and [3] retain 2048 so we keep it
341+
-- at that.
342+
, warpsPerMP = 64
343+
, maxRegPerBlock = 32768 -- value from [2], wrong in [3]?
324344
, warpAllocUnit = 4
325345
, maxGridsPerDevice = 16
326346
}
@@ -346,7 +366,7 @@ deviceResources = resources . computeCapability
346366

347367
Compute 7 2 -> (resources (Compute 7 0)) -- Volta GV10B
348368
{ maxGridsPerDevice = 16
349-
, maxSharedMemPerBlock = 49152
369+
, maxSharedMemPerBlock = 49152 -- unsure why this is here; [2] and [3] say still 98304
350370
}
351371

352372
Compute 7 5 -> (resources (Compute 7 0)) -- Turing TU1xx
@@ -376,14 +396,91 @@ deviceResources = resources . computeCapability
376396
, warpRegAllocUnit = 256
377397
, maxGridsPerDevice = 128
378398
}
379-
380399
Compute 8 6 -> (resources (Compute 8 0)) -- Ampere GA102
381-
{ warpsPerMP = 48
400+
{ coresPerMP = 128
401+
, warpsPerMP = 48
382402
, threadsPerMP = 1536
383403
, threadBlocksPerMP = 16
384404
, sharedMemPerMP = 102400
385405
, maxSharedMemPerBlock = 102400
386406
}
407+
Compute 8 7 -> (resources (Compute 8 0)) -- Ampere
408+
{ coresPerMP = 128
409+
, warpsPerMP = 48
410+
, threadsPerMP = 1536
411+
, threadBlocksPerMP = 16
412+
}
413+
Compute 8 9 -> (resources (Compute 8 0)) -- Ada
414+
{ coresPerMP = 128
415+
, warpsPerMP = 48
416+
, threadsPerMP = 1536
417+
, threadBlocksPerMP = 24
418+
, sharedMemPerMP = 102400
419+
, maxSharedMemPerBlock = 102400
420+
}
421+
422+
Compute 9 0 -> DeviceResources -- Hopper
423+
{ threadsPerWarp = 32
424+
, coresPerMP = 128
425+
, warpsPerMP = 64
426+
, threadsPerMP = 2048
427+
, threadBlocksPerMP = 32
428+
, sharedMemPerMP = 233472
429+
, maxSharedMemPerBlock = 233472
430+
, regFileSizePerMP = 65536
431+
, maxRegPerBlock = 65536
432+
, regAllocUnit = 256
433+
, regAllocationStyle = Warp
434+
, maxRegPerThread = 255
435+
, sharedMemAllocUnit = 128
436+
, warpAllocUnit = 4
437+
, warpRegAllocUnit = 256
438+
, maxGridsPerDevice = 128
439+
}
440+
441+
Compute 10 0 -> DeviceResources -- Blackwell
442+
{ threadsPerWarp = 32
443+
, coresPerMP = 128
444+
, warpsPerMP = 64
445+
, threadsPerMP = 2048
446+
, threadBlocksPerMP = 32
447+
, sharedMemPerMP = 233472
448+
, maxSharedMemPerBlock = 233472
449+
, regFileSizePerMP = 65536
450+
, maxRegPerBlock = 65536
451+
, regAllocUnit = 256
452+
, regAllocationStyle = Warp
453+
, maxRegPerThread = 255
454+
, sharedMemAllocUnit = 128
455+
, warpAllocUnit = 4
456+
, warpRegAllocUnit = 256
457+
, maxGridsPerDevice = 128
458+
}
459+
Compute 10 1 -> (resources (Compute 10 0)) -- Blackwell
460+
{ warpsPerMP = 48
461+
, threadsPerMP = 1536
462+
, threadBlocksPerMP = 24
463+
}
464+
465+
Compute 12 0 -> DeviceResources -- Blackwell
466+
{ threadsPerWarp = 32
467+
, coresPerMP = 128
468+
, warpsPerMP = 48
469+
, threadsPerMP = 1536
470+
, threadBlocksPerMP = 24
471+
, sharedMemPerMP = 102400
472+
, maxSharedMemPerBlock = 102400
473+
, regFileSizePerMP = 65536
474+
, maxRegPerBlock = 65536
475+
, regAllocUnit = 256
476+
, regAllocationStyle = Warp
477+
, maxRegPerThread = 255
478+
, sharedMemAllocUnit = 128
479+
, warpAllocUnit = 4
480+
, warpRegAllocUnit = 256
481+
, maxGridsPerDevice = 128
482+
}
483+
387484

388485
-- Something might have gone wrong, or the library just needs to be
389486
-- updated for the next generation of hardware, in which case we just want
@@ -393,7 +490,30 @@ deviceResources = resources . computeCapability
393490
-- However, it should be OK because all library functions run in IO, so it
394491
-- is likely the user code is as well.
395492
--
396-
_ -> trace warning $ resources (Compute 6 0)
397-
where warning = unlines [ "*** Warning: Unknown CUDA device compute capability: " ++ show compute
398-
, "*** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" ]
399-
493+
_ -> case warningForCC compute of
494+
Just warning -> trace warning defaultResources
495+
Nothing -> defaultResources
496+
497+
defaultResources = resources (Compute 6 0)
498+
499+
-- All this logic is to ensure the warning is only shown once per unknown
500+
-- compute capability. This may not sound worth it, but in practice it is:
501+
-- empirically, an unknown compute capability often leads to /screenfuls/
502+
-- of warnings in accelerate-llvm-ptx otherwise.
503+
{-# NOINLINE warningForCC #-}
504+
warningForCC :: Compute -> Maybe String
505+
warningForCC compute = unsafePerformIO $ do
506+
unseen <- atomicModifyIORef' warningShown $ \seen ->
507+
-- This is just one tree traversal; lookup-insert would be two traversals.
508+
let seen' = Set.insert compute seen
509+
in (seen', Set.size seen' > Set.size seen)
510+
return $ if unseen
511+
then Just $ unlines
512+
[ "*** Warning: Unknown CUDA device compute capability: " ++ show compute
513+
, "*** Please submit a bug report at https://github.com/tmcdonell/cuda/issues"
514+
, "*** (This warning will only be shown once for this compute capability)" ]
515+
else Nothing
516+
517+
{-# NOINLINE warningShown #-}
518+
warningShown :: IORef (Set Compute)
519+
warningShown = unsafePerformIO $ newIORef mempty

src/Foreign/CUDA/Analysis/Occupancy.hs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@
2828
-- the number in the @.cubin@ file to the amount you dynamically allocate at run
2929
-- time to get the correct shared memory usage.
3030
--
31+
-- __Warning__: Like the official Occupancy Calculator in NVIDIA Nsight
32+
-- Compute, the calculator in this module does not support or consider Thread
33+
-- Block Clusters
34+
-- (<https://docs.nvidia.com/cuda/cuda-c-programming-guide/#thread-block-clusters>)
35+
-- that were introduced with compute capability 9.0 (Hopper). If you use
36+
-- thread block clusters in your kernels, the results you get with the
37+
-- functions in this module may not be accurate. Profile and measure.
38+
--
3139
-- /Notes About Occupancy/
3240
--
3341
-- Higher occupancy does not necessarily mean higher performance. If a kernel

src/Foreign/CUDA/Driver/Graph/Capture.chs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,23 @@ status = requireSDK 'status 10.0
152152
#if CUDA_VERSION < 10010
153153
info :: Stream -> IO (Status, Int64)
154154
info = requireSDK 'info 10.1
155-
#else
155+
#elif CUDA_VERSION < 12000
156156
{# fun unsafe cuStreamGetCaptureInfo as info
157157
{ useStream `Stream'
158158
, alloca- `Status' peekEnum*
159159
, alloca- `Int64' peekIntConv*
160160
}
161161
-> `()' checkStatus*- #}
162+
#else
163+
{# fun unsafe cuStreamGetCaptureInfo_v2 as info
164+
{ useStream `Stream'
165+
, alloca- `Status' peekEnum*
166+
, alloca- `Int64' peekIntConv*
167+
, alloca- `Graph'
168+
, alloca- `Node'
169+
, alloca- `CSize'
170+
}
171+
-> `()' checkStatus*- #}
162172
#endif
163173

164174

src/Foreign/CUDA/Driver/Module/Query.chs

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
module Foreign.CUDA.Driver.Module.Query (
1717

1818
-- ** Querying module inhabitants
19-
getFun, getPtr, getTex,
19+
getFun, getPtr,
2020

2121
) where
2222

@@ -28,7 +28,6 @@ import Foreign.CUDA.Driver.Error
2828
import Foreign.CUDA.Driver.Exec
2929
import Foreign.CUDA.Driver.Marshal ( peekDeviceHandle )
3030
import Foreign.CUDA.Driver.Module.Base
31-
import Foreign.CUDA.Driver.Texture
3231
import Foreign.CUDA.Internal.C2HS
3332
import Foreign.CUDA.Ptr
3433

@@ -92,26 +91,6 @@ getPtr !mdl !name = do
9291
-> `Status' cToEnum #}
9392

9493

95-
-- |
96-
-- Return a handle to a texture reference. This texture reference handle
97-
-- should not be destroyed, as the texture will be destroyed automatically
98-
-- when the module is unloaded.
99-
--
100-
-- <http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9607dcbf911c16420d5264273f2b5608>
101-
--
102-
{-# INLINEABLE getTex #-}
103-
getTex :: Module -> ShortByteString -> IO Texture
104-
getTex !mdl !name = resultIfFound "texture" name =<< cuModuleGetTexRef mdl name
105-
106-
{-# INLINE cuModuleGetTexRef #-}
107-
{# fun unsafe cuModuleGetTexRef
108-
{ alloca- `Texture' peekTex*
109-
, useModule `Module'
110-
, useAsCString* `ShortByteString'
111-
}
112-
-> `Status' cToEnum #}
113-
114-
11594
--------------------------------------------------------------------------------
11695
-- Internal
11796
--------------------------------------------------------------------------------

0 commit comments

Comments
 (0)