Skip to content

Commit b19e47b

Browse files
authored
Update to CUDA 12.9. (#2772)
1 parent 8e50847 commit b19e47b

File tree

19 files changed

+937
-979
lines changed

19 files changed

+937
-979
lines changed

.buildkite/pipeline.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ steps:
6666
matrix:
6767
setup:
6868
cuda:
69+
- "12.9"
6970
- "12.8"
7071
- "12.6"
7172
- "12.5"

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ AbstractFFTs = "0.4, 0.5, 1.0"
5252
Adapt = "4"
5353
BFloat16s = "0.2, 0.3, 0.4, 0.5"
5454
CEnum = "0.2, 0.3, 0.4, 0.5"
55-
CUDA_Driver_jll = "0.12"
55+
CUDA_Driver_jll = "0.13"
5656
CUDA_Runtime_Discovery = "0.3.3"
57-
CUDA_Runtime_jll = "0.16"
57+
CUDA_Runtime_jll = "0.17"
5858
ChainRulesCore = "1"
5959
Crayons = "4"
6060
DataFrames = "1"

lib/cublas/libcublas.jl

Lines changed: 251 additions & 353 deletions
Large diffs are not rendered by default.

lib/cudadrv/libcuda.jl

Lines changed: 137 additions & 137 deletions
Large diffs are not rendered by default.

lib/cudnn/src/cuDNN.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ module cuDNN
99

1010
using CUDA
1111
using CUDA.APIUtils
12-
using CUDA: CUstream, libraryPropertyType
12+
using CUDA: CUstream, CUgraph, libraryPropertyType
1313
using CUDA: retry_reclaim, isdebug, initialize_context
1414

1515
using CEnum: @cenum

lib/cudnn/src/libcudnn.jl

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ using CEnum
55

66
# cuDNN uses CUDA runtime objects, which are compatible with our driver usage
77
const cudaStream_t = CUstream
8+
const cudaGraph_t = CUgraph
89

910
# outlined functionality to avoid GC frame allocation
1011
@noinline function throw_api_error(res)
@@ -16,11 +17,9 @@ const cudaStream_t = CUstream
1617
end
1718

1819
@inline function check(f)
19-
function retry_if(res)
20-
return res in (CUDNN_STATUS_NOT_INITIALIZED,
21-
CUDNN_STATUS_ALLOC_FAILED,
22-
CUDNN_STATUS_INTERNAL_ERROR)
23-
end
20+
retry_if(res) = res in (CUDNN_STATUS_NOT_INITIALIZED,
21+
CUDNN_STATUS_ALLOC_FAILED,
22+
CUDNN_STATUS_INTERNAL_ERROR)
2423
res = retry_reclaim(f, retry_if)
2524

2625
if res != CUDNN_STATUS_SUCCESS
@@ -48,6 +47,8 @@ end
4847
CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008
4948
CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009
5049
CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010
50+
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011
51+
CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012
5152
CUDNN_STATUS_NOT_SUPPORTED = 3000
5253
CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001
5354
CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002
@@ -61,6 +62,7 @@ end
6162
CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010
6263
CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011
6364
CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012
65+
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013
6466
CUDNN_STATUS_INTERNAL_ERROR = 4000
6567
CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001
6668
CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002
@@ -166,6 +168,8 @@ end
166168
CUDNN_DATA_FP8_E4M3 = 12
167169
CUDNN_DATA_FP8_E5M2 = 13
168170
CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14
171+
CUDNN_DATA_FP8_E8M0 = 15
172+
CUDNN_DATA_FP4_E2M1 = 16
169173
end
170174

171175
@cenum cudnnMathType_t::UInt32 begin
@@ -449,6 +453,7 @@ end
449453
CUDNN_ATTR_OPERATIONGRAPH_OPS = 801
450454
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802
451455
CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803
456+
CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804
452457
CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900
453458
CUDNN_ATTR_TENSOR_DATA_TYPE = 901
454459
CUDNN_ATTR_TENSOR_DIMENSIONS = 902
@@ -577,7 +582,19 @@ end
577582
CUDNN_ATTR_OPERATION_RNG_SEED = 2311
578583
CUDNN_ATTR_OPERATION_RNG_DESC = 2312
579584
CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313
580-
CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2400
585+
CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400
586+
CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401
587+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500
588+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501
589+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502
590+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503
591+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504
592+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_DENOM_FACTOR_MODE = 2505
593+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600
594+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601
595+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602
596+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603
597+
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604
581598
end
582599

583600
@cenum cudnnBackendAttributeType_t::UInt32 begin
@@ -650,6 +667,8 @@ end
650667
CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33
651668
CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34
652669
CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35
670+
CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36
671+
CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37
653672
end
654673

655674
@cenum cudnnBackendNumericalNote_t::UInt32 begin
@@ -670,7 +689,8 @@ end
670689
CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0
671690
CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1
672691
CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2
673-
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 3
692+
CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3
693+
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4
674694
end
675695

676696
@cenum cudnnBackendKnobType_t::UInt32 begin
@@ -711,7 +731,13 @@ end
711731
CUDNN_KNOB_TYPE_TILE_ROWS = 34
712732
CUDNN_KNOB_TYPE_TILE_COLS = 35
713733
CUDNN_KNOB_TYPE_LOAD_SIZE = 36
714-
CUDNN_KNOB_TYPE_COUNTS = 37
734+
CUDNN_KNOB_TYPE_CTA_COUNT = 37
735+
CUDNN_KNOB_TYPE_STREAM_K = 38
736+
CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39
737+
CUDNN_KNOB_TYPE_TILE_M = 40
738+
CUDNN_KNOB_TYPE_TILE_N = 41
739+
CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42
740+
CUDNN_KNOB_TYPE_COUNTS = 43
715741
end
716742

717743
@cenum cudnnBackendLayoutType_t::UInt32 begin
@@ -734,6 +760,7 @@ end
734760
CUDNN_TENSOR_REORDERING_NONE = 0
735761
CUDNN_TENSOR_REORDERING_INT8x32 = 1
736762
CUDNN_TENSOR_REORDERING_F16x16 = 2
763+
CUDNN_TENSOR_REORDERING_F8_128x4 = 3
737764
end
738765

739766
@cenum cudnnPaddingMode_t::UInt32 begin
@@ -748,6 +775,7 @@ end
748775
CUDNN_BATCH_NORM = 2
749776
CUDNN_GROUP_NORM = 3
750777
CUDNN_RMS_NORM = 4
778+
CUDNN_ADA_LAYER_NORM = 5
751779
end
752780

753781
@cenum cudnnBackendNormFwdPhase_t::UInt32 begin
@@ -805,6 +833,22 @@ end
805833
variantPack::cudnnBackendDescriptor_t)::cudnnStatus_t
806834
end
807835

836+
@checked function cudnnBackendPopulateCudaGraph(handle, executionPlan, variantPack, graph)
837+
initialize_context()
838+
@gcsafe_ccall libcudnn.cudnnBackendPopulateCudaGraph(handle::cudnnHandle_t,
839+
executionPlan::cudnnBackendDescriptor_t,
840+
variantPack::cudnnBackendDescriptor_t,
841+
graph::cudaGraph_t)::cudnnStatus_t
842+
end
843+
844+
@checked function cudnnBackendUpdateCudaGraph(handle, executionPlan, variantPack, graph)
845+
initialize_context()
846+
@gcsafe_ccall libcudnn.cudnnBackendUpdateCudaGraph(handle::cudnnHandle_t,
847+
executionPlan::cudnnBackendDescriptor_t,
848+
variantPack::cudnnBackendDescriptor_t,
849+
graph::cudaGraph_t)::cudnnStatus_t
850+
end
851+
808852
mutable struct cudnnTensorStruct end
809853

810854
const cudnnTensorDescriptor_t = Ptr{cudnnTensorStruct}
@@ -3542,7 +3586,7 @@ end
35423586
varPack::cudnnFusedOpsVariantParamPack_t)::cudnnStatus_t
35433587
end
35443588

3545-
const CUDNN_MAX_SM_MAJOR_NUMBER = 9
3589+
const CUDNN_MAX_SM_MAJOR_NUMBER = 12
35463590

35473591
const CUDNN_MAX_SM_MINOR_NUMBER = 0
35483592

0 commit comments

Comments
 (0)