From ef44a59a21025e9f3920593570a9ed10727506ee Mon Sep 17 00:00:00 2001
From: Alex Maclean
Date: Fri, 9 May 2025 17:55:00 +0000
Subject: [PATCH] [NVPTX] Add intrinsics for the bmsk instruction

Add the llvm.nvvm.bmsk.{wrap,clamp} intrinsics and lower them to the PTX
bmsk.{wrap,clamp}.b32 instruction (guarded by sm_70 and PTX 7.6). Document
the intrinsics in NVPTXUsage.rst and add a codegen test covering the
register/register, register/immediate, immediate/register and
immediate/immediate operand forms.

---
 llvm/docs/NVPTXUsage.rst                 | 26 ++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   | 10 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 ++++
 llvm/test/CodeGen/NVPTX/bmsk.ll          | 77 ++++++++++++++++++++++++
 4 files changed, 125 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/bmsk.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
 zero-extended the case of the '``zext``' variants, or sign-extended the case of
 the '``sext``' variants. If N is 0, the result is 0.
 
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+    declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
 TMA family of Intrinsics
 ------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
           [llvm_i32_ty, llvm_i32_ty],
           [IntrNoMem, IntrSpeculatable]>;
 
+
+//
+// BMSK - bit mask
+//
+  foreach mode = ["wrap", "clamp"] in
+    def int_nvvm_bmsk_ # mode :
+      DefaultAttrsIntrinsic<[llvm_i32_ty],
+        [llvm_i32_ty, llvm_i32_ty],
+        [IntrNoMem, IntrSpeculatable]>;
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
   }
 }
 
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+  defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+  defm BMSK_ # mode
+      : I3Inst<"bmsk." # mode # ".b32",
+               intrin, I32RT, commutative = false,
+               requires = [hasSM<70>, hasPTX<76>]>;
+}
+
 //
 // Convert
 //
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT:    bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT:    bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 5;
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp_ri(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_ri_param_0];
+; CHECK-NEXT:    bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ir(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_ir_param_0];
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+  ret i32 %c
+}