diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 2c550de7a9203..8b87822d3fdda 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr // MISC // +defvar WARP_SIZE = 32; + +// Note: the maximum grid size in the x-dimension is the lower value of 65535 +// on sm_20. We conservatively use the larger value here as it is required for +// sm_30+ and also correct for sm_20. +defvar MAX_GRID_SIZE_X = 0x7fffffff; +defvar MAX_GRID_SIZE_Y = 0xffff; +defvar MAX_GRID_SIZE_Z = 0xffff; + +defvar MAX_BLOCK_SIZE_X = 1024; +defvar MAX_BLOCK_SIZE_Y = 1024; +defvar MAX_BLOCK_SIZE_Z = 64; + // Helper class that concatenates list elements with // a given separator 'sep' and returns the result. // Handles empty strings. @@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap // Accessing special registers. -class PTXReadSRegIntrinsicNB_r32 - : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef]>; -class PTXReadSRegIntrinsic_r32 - : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; +class PTXReadSRegIntrinsicNB_r32 properties = []> + : DefaultAttrsIntrinsic<[llvm_i32_ty], [], + !listconcat([IntrNoMem, IntrSpeculatable, NoUndef], properties)>; -multiclass PTXReadSRegIntrinsic_v4i32 { +class PTXReadSRegIntrinsic_r32 properties = []> + : PTXReadSRegIntrinsicNB_r32, + ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; + +multiclass PTXReadSRegIntrinsic_v4i32> properties = [[], [], [], []]> { + assert !eq(!size(properties), 4), "properties must be a list of 4 lists"; // FIXME: Do we need the 128-bit integer type version? // def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>; // FIXME: Enable this once v4i32 support is enabled in back-end. 
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>; - foreach suffix = ["_x", "_y", "_z", "_w"] in - def suffix : PTXReadSRegIntrinsic_r32; + defvar suffixes = ["_x", "_y", "_z", "_w"]; + foreach i = !range(suffixes) in + def suffixes[i] : PTXReadSRegIntrinsic_r32; } // Same, but without automatic clang builtins. It will be used for // registers that require particular GPU or PTX version. -multiclass PTXReadSRegIntrinsicNB_v4i32 { - foreach suffix = ["_x", "_y", "_z", "_w"] in - def suffix : PTXReadSRegIntrinsicNB_r32; +multiclass PTXReadSRegIntrinsicNB_v4i32> properties = [[], [], [], []]> { + assert !eq(!size(properties), 4), "properties must be a list of 4 lists"; + defvar suffixes = ["_x", "_y", "_z", "_w"]; + foreach i = !range(suffixes) in + def suffixes[i] : PTXReadSRegIntrinsicNB_r32; } class PTXReadSRegIntrinsic_r64 @@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64 : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef]>, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>; -defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">; -defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">; +defm int_nvvm_read_ptx_sreg_tid + : PTXReadSRegIntrinsic_v4i32<"tid", + [[Range], + [Range], + [Range], + [Range]]>; + +defm int_nvvm_read_ptx_sreg_ntid + : PTXReadSRegIntrinsic_v4i32<"ntid", + [[Range], + [Range], + [Range], + [Range]]>; + +def int_nvvm_read_ptx_sreg_laneid + : PTXReadSRegIntrinsic_r32<"laneid", [Range]>; -def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">; def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">; def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">; -defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">; -defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">; +defvar MAX_GRID_ID_RANGE = [[Range], + [Range], + [Range], + [Range]]; + +defvar MAX_GRID_NID_RANGE = 
[[Range], + [Range], + [Range], + [Range]]; + +defm int_nvvm_read_ptx_sreg_ctaid + : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>; + +defm int_nvvm_read_ptx_sreg_nctaid + : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>; def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">; def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">; @@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">; def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">; def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">; -def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">; +def int_nvvm_read_ptx_sreg_warpsize + : PTXReadSRegIntrinsic_r32<"warpsize", + [Range]>; // sm90+, PTX7.8+ -defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32; -defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32; -defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32; -defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32; + +// Note: Since clusters are subdivisions of the grid, we conservatively use the +// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In +// practice, the clusterid will likely be much smaller. The CUDA programming +// guide recommends 8 as a maximum portable value and H100s support 16. 
+ +defm int_nvvm_read_ptx_sreg_clusterid + : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_nclusterid + : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_cluster_ctaid + : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_cluster_nctaid + : PTXReadSRegIntrinsicNB_v4i32; def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32; def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 61b50b69b4e86..6586f925504f1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -330,6 +330,16 @@ std::optional getOverallReqNTID(const Function &F) { return getVectorProduct(ReqNTID); } +std::optional getOverallClusterRank(const Function &F) { + // maxclusterrank and cluster_dim are mutually exclusive. + if (const auto ClusterRank = getMaxClusterRank(F)) + return ClusterRank; + + // Note: The semantics here are a bit strange. See getMaxNTID. 
+ const auto ClusterDim = getClusterDim(F); + return getVectorProduct(ClusterDim); +} + std::optional getMaxClusterRank(const Function &F) { return getFnAttrParsedInt(F, "nvvm.maxclusterrank"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index a1b4a0e5e7471..e792e441e49e6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -54,6 +54,7 @@ SmallVector getClusterDim(const Function &); std::optional getOverallMaxNTID(const Function &); std::optional getOverallReqNTID(const Function &); +std::optional getOverallClusterRank(const Function &); std::optional getMaxClusterRank(const Function &); std::optional getMinCTASm(const Function &); diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index 91b8e470e055e..2c81989932a97 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) { } static bool runNVVMIntrRange(Function &F) { - struct { - unsigned x, y, z; - } MaxBlockSize, MaxGridSize; + struct Vector3 { + unsigned X, Y, Z; + }; - const unsigned MetadataNTID = getOverallReqNTID(F).value_or( - getOverallMaxNTID(F).value_or(std::numeric_limits::max())); + // All these annotations are only valid for kernel functions. + if (!isKernelFunction(F)) + return false; - MaxBlockSize.x = std::min(1024u, MetadataNTID); - MaxBlockSize.y = std::min(1024u, MetadataNTID); - MaxBlockSize.z = std::min(64u, MetadataNTID); + const auto OverallReqNTID = getOverallReqNTID(F); + const auto OverallMaxNTID = getOverallMaxNTID(F); + const auto OverallClusterRank = getOverallClusterRank(F); - MaxGridSize.x = 0x7fffffff; - MaxGridSize.y = 0xffff; - MaxGridSize.z = 0xffff; + // If this function lacks any range information, do nothing. 
+ if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank)) + return false; - // Go through the calls in this function. - bool Changed = false; - for (Instruction &I : instructions(F)) { - IntrinsicInst *II = dyn_cast(&I); - if (!II) - continue; + const unsigned FunctionNTID = OverallReqNTID.value_or( + OverallMaxNTID.value_or(std::numeric_limits::max())); + const unsigned FunctionClusterRank = + OverallClusterRank.value_or(std::numeric_limits::max()); + + const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID), + std::min(1024u, FunctionNTID), + std::min(64u, FunctionNTID)}; + + // We conservatively use the maximum grid size as an upper bound for the + // cluster rank. + const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank), + std::min(0xffffu, FunctionClusterRank), + std::min(0xffffu, FunctionClusterRank)}; + + const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool { switch (II->getIntrinsicID()) { // Index within block case Intrinsic::nvvm_read_ptx_sreg_tid_x: - Changed |= addRangeAttr(0, MaxBlockSize.x, II); - break; + return addRangeAttr(0, MaxBlockSize.X, II); case Intrinsic::nvvm_read_ptx_sreg_tid_y: - Changed |= addRangeAttr(0, MaxBlockSize.y, II); - break; + return addRangeAttr(0, MaxBlockSize.Y, II); case Intrinsic::nvvm_read_ptx_sreg_tid_z: - Changed |= addRangeAttr(0, MaxBlockSize.z, II); - break; + return addRangeAttr(0, MaxBlockSize.Z, II); // Block size case Intrinsic::nvvm_read_ptx_sreg_ntid_x: - Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II); - break; + return addRangeAttr(1, MaxBlockSize.X + 1, II); case Intrinsic::nvvm_read_ptx_sreg_ntid_y: - Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II); - break; + return addRangeAttr(1, MaxBlockSize.Y + 1, II); case Intrinsic::nvvm_read_ptx_sreg_ntid_z: - Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II); - break; - - // Index within grid - case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: - Changed |= addRangeAttr(0, MaxGridSize.x, II); - break; - case 
Intrinsic::nvvm_read_ptx_sreg_ctaid_y: - Changed |= addRangeAttr(0, MaxGridSize.y, II); + return addRangeAttr(1, MaxBlockSize.Z + 1, II); + + // Cluster size + case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x: + return addRangeAttr(0, MaxClusterRank.X, II); + case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y: + return addRangeAttr(0, MaxClusterRank.Y, II); + case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z: + return addRangeAttr(0, MaxClusterRank.Z, II); + case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x: + return addRangeAttr(1, MaxClusterRank.X + 1, II); + case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y: + return addRangeAttr(1, MaxClusterRank.Y + 1, II); + case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z: + return addRangeAttr(1, MaxClusterRank.Z + 1, II); + + case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank: + if (OverallClusterRank) + return addRangeAttr(0, FunctionClusterRank, II); break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: - Changed |= addRangeAttr(0, MaxGridSize.z, II); + case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank: + if (OverallClusterRank) + return addRangeAttr(1, FunctionClusterRank + 1, II); break; - - // Grid size - case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: - Changed |= addRangeAttr(1, MaxGridSize.x + 1, II); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: - Changed |= addRangeAttr(1, MaxGridSize.y + 1, II); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: - Changed |= addRangeAttr(1, MaxGridSize.z + 1, II); - break; - - // warp size is constant 32. - case Intrinsic::nvvm_read_ptx_sreg_warpsize: - Changed |= addRangeAttr(32, 32 + 1, II); - break; - - // Lane ID is [0..warpsize) - case Intrinsic::nvvm_read_ptx_sreg_laneid: - Changed |= addRangeAttr(0, 32, II); - break; - default: - break; + return false; } - } + return false; + }; + + // Go through the calls in this function. 
+ bool Changed = false; + for (Instruction &I : instructions(F)) + if (IntrinsicInst *II = dyn_cast(&I)) + Changed |= ProccessIntrinsic(II); return Changed; } diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll index 884a4b1a3584f..48fa3e06629b4 100644 --- a/llvm/test/CodeGen/NVPTX/intr-range.ll +++ b/llvm/test/CodeGen/NVPTX/intr-range.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | FileCheck %s --check-prefix=DEFAULT define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" { ; CHECK-LABEL: define ptx_kernel i32 @test_maxntid( @@ -74,10 +75,149 @@ define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" { ret i32 %1 } +define ptx_kernel i32 @test_cluster_ctaid() "nvvm.maxclusterrank"="8" { +; CHECK-LABEL: define ptx_kernel i32 @test_cluster_ctaid( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() +; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() +; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() +; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() +; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() +; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]] +; CHECK-NEXT: ret i32 [[TMP15]] +; + %1 = call i32 
@llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() + %2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() + %4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() + %5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() + %6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() + %7 = add i32 %1, %2 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + %10 = add i32 %9, %5 + %11 = add i32 %10, %6 + ret i32 %11 +} + +define ptx_kernel i32 @test_cluster_dim() "nvvm.cluster_dim"="4,4,1" { +; CHECK-LABEL: define ptx_kernel i32 @test_cluster_dim( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() +; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() +; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() +; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() +; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() +; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]] +; CHECK-NEXT: ret i32 [[TMP15]] +; + %1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() + %2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() + %4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() + %5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() + %6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() + %7 = add i32 %1, %2 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + %10 = add 
i32 %9, %5 + %11 = add i32 %10, %6 + ret i32 %11 +} + + +; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; DEFAULT-DAG: declare noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.tid.w() + +; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; DEFAULT-DAG: declare noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ntid.w() + +; DEFAULT-DAG: declare noundef range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid() +; DEFAULT-DAG: declare noundef range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize() + +; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ctaid.w() + +; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() +; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nctaid.w() + +; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.clusterid.x() +; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.y() +; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.z() +; DEFAULT-DAG: declare noundef 
range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.clusterid.w() + +; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x() +; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y() +; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w() + +; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() +; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() +; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w() + +; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() +; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() +; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() +; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w() + declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.w() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.w() + +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.w() + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +declare i32 
@llvm.nvvm.read.ptx.sreg.nctaid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.w() + +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.w() + +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w() + +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w() + +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w() diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index 72c4cda16db47..f595df837f91f 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -1,13 +1,10 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_device i32 @test_tid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x; -; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ret i32 %x @@ -15,7 +12,6 @@ 
define ptx_device i32 @test_tid_x() { define ptx_device i32 @test_tid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y; -; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() ret i32 %x @@ -23,7 +19,6 @@ define ptx_device i32 @test_tid_y() { define ptx_device i32 @test_tid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z; -; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() ret i32 %x @@ -38,7 +33,6 @@ define ptx_device i32 @test_tid_w() { define ptx_device i32 @test_ntid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x; -; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ret i32 %x @@ -46,7 +40,6 @@ define ptx_device i32 @test_ntid_x() { define ptx_device i32 @test_ntid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y; -; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() ret i32 %x @@ -54,7 +47,6 @@ define ptx_device i32 @test_ntid_y() { define ptx_device i32 @test_ntid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z; -; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() ret i32 %x @@ -69,7 +61,6 @@ define ptx_device i32 @test_ntid_w() { define ptx_device i32 @test_laneid() { ; CHECK: mov.u32 %r{{[0-9]+}}, %laneid; -; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.laneid() ret i32 %x @@ -77,7 +68,6 @@ define ptx_device i32 @test_laneid() { define ptx_device i32 @test_warpsize() { ; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ; -; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() ret i32 %x @@ -99,7 +89,6 @@ define ptx_device i32 @test_nwarpid() { 
define ptx_device i32 @test_ctaid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y; -; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() ret i32 %x @@ -107,7 +96,6 @@ define ptx_device i32 @test_ctaid_y() { define ptx_device i32 @test_ctaid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z; -; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() ret i32 %x @@ -115,7 +103,6 @@ define ptx_device i32 @test_ctaid_z() { define ptx_device i32 @test_ctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x; -; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ret i32 %x @@ -130,7 +117,6 @@ define ptx_device i32 @test_ctaid_w() { define ptx_device i32 @test_nctaid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y; -; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() ret i32 %x @@ -138,7 +124,6 @@ define ptx_device i32 @test_nctaid_y() { define ptx_device i32 @test_nctaid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z; -; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ret i32 %x @@ -146,7 +131,6 @@ define ptx_device i32 @test_nctaid_z() { define ptx_device i32 @test_nctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; -; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ret i32 %x @@ -154,7 +138,7 @@ define ptx_device i32 @test_nctaid_x() { define ptx_device i32 @test_already_has_range_md() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[ALREADY:[0-9]+]] +; CHECK: ret; %x = call i32 
@llvm.nvvm.read.ptx.sreg.nctaid.x(), !range !0 ret i32 %x } @@ -316,4 +300,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.pm3() declare void @llvm.nvvm.bar.sync(i32 %i) !0 = !{i32 0, i32 19} -; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19}