@@ -5,6 +5,7 @@ using CEnum
5
5
6
6
# cuDNN uses CUDA runtime objects, which are compatible with our driver usage
7
7
const cudaStream_t = CUstream
8
+ const cudaGraph_t = CUgraph
8
9
9
10
# outlined functionality to avoid GC frame allocation
10
11
@noinline function throw_api_error (res)
@@ -16,11 +17,9 @@ const cudaStream_t = CUstream
16
17
end
17
18
18
19
@inline function check (f)
19
- function retry_if (res)
20
- return res in (CUDNN_STATUS_NOT_INITIALIZED,
21
- CUDNN_STATUS_ALLOC_FAILED,
22
- CUDNN_STATUS_INTERNAL_ERROR)
23
- end
20
+ retry_if (res) = res in (CUDNN_STATUS_NOT_INITIALIZED,
21
+ CUDNN_STATUS_ALLOC_FAILED,
22
+ CUDNN_STATUS_INTERNAL_ERROR)
24
23
res = retry_reclaim (f, retry_if)
25
24
26
25
if res != CUDNN_STATUS_SUCCESS
48
47
CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008
49
48
CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009
50
49
CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010
50
+ CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011
51
+ CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012
51
52
CUDNN_STATUS_NOT_SUPPORTED = 3000
52
53
CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001
53
54
CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002
61
62
CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010
62
63
CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011
63
64
CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012
65
+ CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013
64
66
CUDNN_STATUS_INTERNAL_ERROR = 4000
65
67
CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001
66
68
CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002
166
168
CUDNN_DATA_FP8_E4M3 = 12
167
169
CUDNN_DATA_FP8_E5M2 = 13
168
170
CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14
171
+ CUDNN_DATA_FP8_E8M0 = 15
172
+ CUDNN_DATA_FP4_E2M1 = 16
169
173
end
170
174
171
175
@cenum cudnnMathType_t:: UInt32 begin
449
453
CUDNN_ATTR_OPERATIONGRAPH_OPS = 801
450
454
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802
451
455
CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803
456
+ CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804
452
457
CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900
453
458
CUDNN_ATTR_TENSOR_DATA_TYPE = 901
454
459
CUDNN_ATTR_TENSOR_DIMENSIONS = 902
577
582
CUDNN_ATTR_OPERATION_RNG_SEED = 2311
578
583
CUDNN_ATTR_OPERATION_RNG_DESC = 2312
579
584
CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313
580
- CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2400
585
+ CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400
586
+ CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401
587
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500
588
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501
589
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502
590
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503
591
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504
592
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_DENOM_FACTOR_MODE = 2505
593
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600
594
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601
595
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602
596
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603
597
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604
581
598
end
582
599
583
600
@cenum cudnnBackendAttributeType_t:: UInt32 begin
650
667
CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33
651
668
CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34
652
669
CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35
670
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36
671
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37
653
672
end
654
673
655
674
@cenum cudnnBackendNumericalNote_t:: UInt32 begin
670
689
CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0
671
690
CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1
672
691
CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2
673
- CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 3
692
+ CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3
693
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4
674
694
end
675
695
676
696
@cenum cudnnBackendKnobType_t:: UInt32 begin
711
731
CUDNN_KNOB_TYPE_TILE_ROWS = 34
712
732
CUDNN_KNOB_TYPE_TILE_COLS = 35
713
733
CUDNN_KNOB_TYPE_LOAD_SIZE = 36
714
- CUDNN_KNOB_TYPE_COUNTS = 37
734
+ CUDNN_KNOB_TYPE_CTA_COUNT = 37
735
+ CUDNN_KNOB_TYPE_STREAM_K = 38
736
+ CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39
737
+ CUDNN_KNOB_TYPE_TILE_M = 40
738
+ CUDNN_KNOB_TYPE_TILE_N = 41
739
+ CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42
740
+ CUDNN_KNOB_TYPE_COUNTS = 43
715
741
end
716
742
717
743
@cenum cudnnBackendLayoutType_t:: UInt32 begin
734
760
CUDNN_TENSOR_REORDERING_NONE = 0
735
761
CUDNN_TENSOR_REORDERING_INT8x32 = 1
736
762
CUDNN_TENSOR_REORDERING_F16x16 = 2
763
+ CUDNN_TENSOR_REORDERING_F8_128x4 = 3
737
764
end
738
765
739
766
@cenum cudnnPaddingMode_t:: UInt32 begin
748
775
CUDNN_BATCH_NORM = 2
749
776
CUDNN_GROUP_NORM = 3
750
777
CUDNN_RMS_NORM = 4
778
+ CUDNN_ADA_LAYER_NORM = 5
751
779
end
752
780
753
781
@cenum cudnnBackendNormFwdPhase_t:: UInt32 begin
805
833
variantPack:: cudnnBackendDescriptor_t ):: cudnnStatus_t
806
834
end
807
835
836
# Low-level wrapper around cuDNN's `cudnnBackendPopulateCudaGraph`.
#
# Calls `initialize_context()` first, then forwards the four arguments unchanged
# to the `libcudnn` entry point of the same name:
#   handle        - passed as `cudnnHandle_t`
#   executionPlan - passed as `cudnnBackendDescriptor_t`
#   variantPack   - passed as `cudnnBackendDescriptor_t`
#   graph         - passed as `cudaGraph_t`
#
# The foreign call returns a `cudnnStatus_t`.
# NOTE(review): `@checked` is defined outside this view; presumably it routes the
# returned status through `check` -- confirm against the macro definition.
# See the cuDNN backend API reference for the requirements cuDNN places on `graph`.
@checked function cudnnBackendPopulateCudaGraph(handle, executionPlan, variantPack, graph)
    initialize_context()
    @gcsafe_ccall libcudnn.cudnnBackendPopulateCudaGraph(handle::cudnnHandle_t,
                                                         executionPlan::cudnnBackendDescriptor_t,
                                                         variantPack::cudnnBackendDescriptor_t,
                                                         graph::cudaGraph_t)::cudnnStatus_t
end
843
+
844
# Low-level wrapper around cuDNN's `cudnnBackendUpdateCudaGraph`.
#
# Calls `initialize_context()` first, then forwards the four arguments unchanged
# to the `libcudnn` entry point of the same name:
#   handle        - passed as `cudnnHandle_t`
#   executionPlan - passed as `cudnnBackendDescriptor_t`
#   variantPack   - passed as `cudnnBackendDescriptor_t`
#   graph         - passed as `cudaGraph_t`
#
# The foreign call returns a `cudnnStatus_t`.
# NOTE(review): `@checked` is defined outside this view; presumably it routes the
# returned status through `check` -- confirm against the macro definition.
# See the cuDNN backend API reference for which graphs cuDNN accepts for an update.
@checked function cudnnBackendUpdateCudaGraph(handle, executionPlan, variantPack, graph)
    initialize_context()
    @gcsafe_ccall libcudnn.cudnnBackendUpdateCudaGraph(handle::cudnnHandle_t,
                                                       executionPlan::cudnnBackendDescriptor_t,
                                                       variantPack::cudnnBackendDescriptor_t,
                                                       graph::cudaGraph_t)::cudnnStatus_t
end
851
+
808
852
mutable struct cudnnTensorStruct end
809
853
810
854
const cudnnTensorDescriptor_t = Ptr{cudnnTensorStruct}
@@ -3542,7 +3586,7 @@ end
3542
3586
varPack:: cudnnFusedOpsVariantParamPack_t ):: cudnnStatus_t
3543
3587
end
3544
3588
3545
- const CUDNN_MAX_SM_MAJOR_NUMBER = 9
3589
+ const CUDNN_MAX_SM_MAJOR_NUMBER = 12
3546
3590
3547
3591
const CUDNN_MAX_SM_MINOR_NUMBER = 0
3548
3592
0 commit comments