@@ -139,6 +139,16 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
139139// MISC
140140//
141141
// Hardware limits used as bounds in the Range<RetIndex, lo, hi> properties that
// are attached to the special-register read intrinsics later in this file.
// NOTE(review): the uses below pass [0, MAX) for index-style registers and
// [1, MAX + 1) for size-style registers, which suggests the upper bound of
// Range<> is exclusive -- confirm against the Range property definition.
//
// WARP_SIZE bounds the laneid result and pins the warpsize result.
142+ defvar WARP_SIZE = 32;
143+
// Per-dimension grid limits; used to build the ranges for ctaid/nctaid and,
// conservatively, for the cluster id/size registers.
144+ defvar MAX_GRID_SIZE_X = 0x7fffffff;
145+ defvar MAX_GRID_SIZE_Y = 0xffff;
146+ defvar MAX_GRID_SIZE_Z = 0xffff;
147+
// Per-dimension block limits; used to build the ranges for tid/ntid.
148+ defvar MAX_BLOCK_SIZE_X = 1024;
149+ defvar MAX_BLOCK_SIZE_Y = 1024;
150+ defvar MAX_BLOCK_SIZE_Z = 64;
151+
142152// Helper class that concatenates list elements with
143153// a given separator 'sep' and returns the result.
144154// Handles empty strings.
@@ -4747,26 +4757,33 @@ def int_nvvm_sust_p_3d_v4i32_trap
47474757
47484758// Accessing special registers.
47494759
// PTXReadSRegIntrinsicNB_r32: intrinsic that takes no operands and returns an
// i32, without an attached clang builtin. It always carries IntrNoMem,
// IntrSpeculatable and NoUndef<RetIndex>; the optional 'properties' list is
// appended to those with !listconcat, so callers can add e.g. a Range<> on the
// result. The default (empty list) yields exactly the three base properties.
4750- class PTXReadSRegIntrinsicNB_r32
4751- : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
4752- class PTXReadSRegIntrinsic_r32<string name>
4753- : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4760+ class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
4761+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
4762+ !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
47544763
4755- multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
// PTXReadSRegIntrinsic_r32: same as PTXReadSRegIntrinsicNB_r32, plus a clang
// builtin named "__nvvm_read_ptx_sreg_" followed by 'name'. 'properties' is
// forwarded unchanged to the NB base class.
4764+ class PTXReadSRegIntrinsic_r32<string name,
4765+ list<IntrinsicProperty> properties = []>
4766+ : PTXReadSRegIntrinsicNB_r32<properties>,
4767+ ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4768+
// Defines four i32 special-register intrinsics, one per suffix "_x", "_y",
// "_z" and "_w", each with the clang builtin
// "__nvvm_read_ptx_sreg_" # regname # suffix.
// 'properties' is indexed in parallel with the suffix list: properties[i] is
// the extra property list for the i-th suffix, so it must have four entries.
// The default supplies four empty lists (no extra properties).
4769+ multiclass PTXReadSRegIntrinsic_v4i32<string regname,
4770+ list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
47564771// FIXME: Do we need the 128-bit integer type version?
47574772// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
47584773
47594774// FIXME: Enable this once v4i32 support is enabled in back-end.
47604775// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
4761- foreach suffix = ["_x", "_y", "_z", "_w"] in
4762- def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
// Iterate by index rather than by value so that the same index selects both
// the suffix and its matching property list.
4776+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4777+ foreach i = !range(suffixes) in
4778+ def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
47634779}
47644780
47654781// Same, but without automatic clang builtins. It will be used for
47664782// registers that require particular GPU or PTX version.
// 'properties' holds one extra property list per suffix ("_x", "_y", "_z",
// "_w"), selected by index; it must have four entries and defaults to four
// empty lists.
4767- multiclass PTXReadSRegIntrinsicNB_v4i32 {
4768- foreach suffix = ["_x", "_y", "_z", "_w"] in
4769- def suffix : PTXReadSRegIntrinsicNB_r32;
4783+ multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4784+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4785+ foreach i = !range(suffixes) in
4786+ def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
47704787}
47714788
47724789class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4799,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
47824799 : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
47834800 ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
47844801
4785- defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
4786- defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
// tid.{x,y,z}: result range starts at 0 and is bounded by MAX_BLOCK_SIZE_*.
// tid.w is given Range<RetIndex, 0, 1>.
// NOTE(review): this reads as a half-open range [lo, hi), i.e. tid.w == 0 --
// confirm the Range<> upper bound is exclusive.
4802+ defm int_nvvm_read_ptx_sreg_tid
4803+ : PTXReadSRegIntrinsic_v4i32<"tid",
4804+ [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
4805+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
4806+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
4807+ [Range<RetIndex, 0, 1>]]>;
4808+
// ntid.{x,y,z}: result range starts at 1 and uses MAX_BLOCK_SIZE_* + 1 as the
// upper bound, so the maximum block size itself is representable.
// ntid.w uses the same Range<RetIndex, 0, 1> as tid.w.
4809+ defm int_nvvm_read_ptx_sreg_ntid
4810+ : PTXReadSRegIntrinsic_v4i32<"ntid",
4811+ [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
4812+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
4813+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
4814+ [Range<RetIndex, 0, 1>]]>;
4815+
// laneid: result range starts at 0 and is bounded by WARP_SIZE.
4816+ def int_nvvm_read_ptx_sreg_laneid
4817+ : PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
47874818
4788- def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
// warpid / nwarpid pass no extra properties, so no Range<> is attached.
47894819def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
47904820def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
47914821
4792- defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
4793- defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
// Per-component (_x, _y, _z, _w) ranges for index-style grid registers:
// lower bound 0, upper bound MAX_GRID_SIZE_*; the _w entry is Range<0, 1>.
// Shared by every intrinsic that passes MAX_GRID_ID_RANGE.
4822+ defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
4823+ [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
4824+ [Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
4825+ [Range<RetIndex, 0, 1>]];
4826+
// Per-component ranges for size-style grid registers: lower bound 1, upper
// bound MAX_GRID_SIZE_* + 1; the _w entry is Range<0, 1>.
// Shared by every intrinsic that passes MAX_GRID_NID_RANGE.
4827+ defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
4828+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
4829+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
4830+ [Range<RetIndex, 0, 1>]];
4831+
// ctaid uses the index-style ranges, nctaid the size-style ranges.
4832+ defm int_nvvm_read_ptx_sreg_ctaid
4833+ : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
4834+
4835+ defm int_nvvm_read_ptx_sreg_nctaid
4836+ : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
47944837
// smid / nsmid pass no extra properties, so no Range<> is attached.
47954838def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
47964839def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4860,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
48174860def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
48184861def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
48194862
4820- def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
// warpsize: the range is Range<WARP_SIZE, WARP_SIZE + 1>.
// NOTE(review): with an exclusive upper bound this pins the result to the
// single value WARP_SIZE -- confirm against the Range<> definition.
4863+ def int_nvvm_read_ptx_sreg_warpsize
4864+ : PTXReadSRegIntrinsic_r32<"warpsize",
4865+ [Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
48214866
48224867// sm90+, PTX7.8+
4823- defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
4824- defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
4825- defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
4826- defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
4868+
4869+ // Note: Since clusters are subdivisions of the grid, we conservatively use the
4870+ // maximum grid size as an upper bound for the cluster special registers. In
4871+ // practice, the values will likely be much smaller. The CUDA programming guide
4872+ // recommends 8 CTAs per cluster as the maximum portable size; H100s support 16.
4873+
// Index-style registers (clusterid, cluster_ctaid) reuse MAX_GRID_ID_RANGE;
// size-style registers (nclusterid, cluster_nctaid) reuse MAX_GRID_NID_RANGE.
4874+ defm int_nvvm_read_ptx_sreg_clusterid
4875+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4876+ defm int_nvvm_read_ptx_sreg_nclusterid
4877+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4878+ defm int_nvvm_read_ptx_sreg_cluster_ctaid
4879+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4880+ defm int_nvvm_read_ptx_sreg_cluster_nctaid
4881+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
48274882
// cluster_ctarank / cluster_nctarank pass no extra properties, so no Range<>
// is attached to them.
48284883def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
48294884def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
0 commit comments