@@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
139139// MISC
140140//
141141
142+ defvar WARP_SIZE = 32;
143+
144+ // Note: the maximum grid size in the x-dimension is only 65535 on sm_20.
145+ // We conservatively use the larger value here, as it is required for
146+ // sm_30+ and is also correct for sm_20.
147+ defvar MAX_GRID_SIZE_X = 0x7fffffff;
148+ defvar MAX_GRID_SIZE_Y = 0xffff;
149+ defvar MAX_GRID_SIZE_Z = 0xffff;
150+
151+ defvar MAX_BLOCK_SIZE_X = 1024;
152+ defvar MAX_BLOCK_SIZE_Y = 1024;
153+ defvar MAX_BLOCK_SIZE_Z = 64;
154+
142155// Helper class that concatenates list elements with
143156// a given separator 'sep' and returns the result.
144157// Handles empty strings.
@@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap
47474760
47484761// Accessing special registers.
47494762
4750- class PTXReadSRegIntrinsicNB_r32
4751- : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
4752- class PTXReadSRegIntrinsic_r32<string name>
4753- : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4763+ class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
4764+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
4765+ !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
47544766
4755- multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
4767+ class PTXReadSRegIntrinsic_r32<string name,
4768+ list<IntrinsicProperty> properties = []>
4769+ : PTXReadSRegIntrinsicNB_r32<properties>,
4770+ ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4771+
4772+ multiclass PTXReadSRegIntrinsic_v4i32<string regname,
4773+ list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4774+ assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
47564775// FIXME: Do we need the 128-bit integer type version?
47574776// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
47584777
47594778// FIXME: Enable this once v4i32 support is enabled in back-end.
47604779// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
4761- foreach suffix = ["_x", "_y", "_z", "_w"] in
4762- def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
4780+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4781+ foreach i = !range(suffixes) in
4782+ def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
47634783}
47644784
47654785// Same, but without automatic clang builtins. It will be used for
47664786// registers that require a particular GPU or PTX version.
4767- multiclass PTXReadSRegIntrinsicNB_v4i32 {
4768- foreach suffix = ["_x", "_y", "_z", "_w"] in
4769- def suffix : PTXReadSRegIntrinsicNB_r32;
4787+ multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4788+ assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
4789+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4790+ foreach i = !range(suffixes) in
4791+ def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
47704792}
47714793
47724794class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
47824804 : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
47834805 ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
47844806
4785- defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
4786- defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
4807+ defm int_nvvm_read_ptx_sreg_tid
4808+ : PTXReadSRegIntrinsic_v4i32<"tid",
4809+ [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
4810+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
4811+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
4812+ [Range<RetIndex, 0, 1>]]>;
4813+
4814+ defm int_nvvm_read_ptx_sreg_ntid
4815+ : PTXReadSRegIntrinsic_v4i32<"ntid",
4816+ [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
4817+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
4818+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
4819+ [Range<RetIndex, 0, 1>]]>;
4820+
4821+ def int_nvvm_read_ptx_sreg_laneid
4822+ : PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
47874823
4788- def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
47894824def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
47904825def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
47914826
4792- defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
4793- defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
4827+ defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
4828+ [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
4829+ [Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
4830+ [Range<RetIndex, 0, 1>]];
4831+
4832+ defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
4833+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
4834+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
4835+ [Range<RetIndex, 0, 1>]];
4836+
4837+ defm int_nvvm_read_ptx_sreg_ctaid
4838+ : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
4839+
4840+ defm int_nvvm_read_ptx_sreg_nctaid
4841+ : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
47944842
47954843def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
47964844def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
48174865def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
48184866def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
48194867
4820- def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
4868+ def int_nvvm_read_ptx_sreg_warpsize
4869+ : PTXReadSRegIntrinsic_r32<"warpsize",
4870+ [Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
48214871
48224872// sm90+, PTX7.8+
4823- defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
4824- defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
4825- defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
4826- defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
4873+
4874+ // Note: Since clusters are subdivisions of the grid, we conservatively use the
4875+ // maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
4876+ // practice, the clusterid will likely be much smaller. The CUDA programming
4877+ // guide recommends 8 as a maximum portable value and H100s support 16.
4878+
4879+ defm int_nvvm_read_ptx_sreg_clusterid
4880+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4881+ defm int_nvvm_read_ptx_sreg_nclusterid
4882+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4883+ defm int_nvvm_read_ptx_sreg_cluster_ctaid
4884+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4885+ defm int_nvvm_read_ptx_sreg_cluster_nctaid
4886+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
48274887
48284888def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
48294889def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
0 commit comments