Skip to content

Conversation

@AlexMaclean
Copy link
Member

No description provided.

@llvmbot
Copy link
Member

llvmbot commented May 9, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/139299.diff

4 Files Affected:

  • (modified) llvm/docs/NVPTXUsage.rst (+26)
  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+10)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+12)
  • (added) llvm/test/CodeGen/NVPTX/bmsk.ll (+77)
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
 zero-extended the case of the '``zext``' variants, or sign-extended the case of
 the '``sext``' variants. If N is 0, the result is 0.
 
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+    declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
 TMA family of Intrinsics
 ------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
           [llvm_i32_ty, llvm_i32_ty],
           [IntrNoMem, IntrSpeculatable]>;
 
+
+//
+// BMSK - bit mask
+//
+  foreach mode = ["wrap", "clamp"] in
+    def int_nvvm_bmsk_ # mode :
+      DefaultAttrsIntrinsic<[llvm_i32_ty],
+        [llvm_i32_ty, llvm_i32_ty],
+        [IntrNoMem, IntrSpeculatable]>;
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
   }
 }
 
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+  defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+  defm BMSK_ # mode
+    : I3Inst<"bmsk." # mode # ".b32",
+             intrin, I32RT, commutative = false,
+             requires = [hasSM<70>, hasPTX<76>]>;
+}
+
 //
 // Convert
 //
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT:    bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT:    bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 5;
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT:    bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+  ret i32 %c
+}

@llvmbot
Copy link
Member

llvmbot commented May 9, 2025

@llvm/pr-subscribers-llvm-ir

Author: Alex MacLean (AlexMaclean)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/139299.diff

4 Files Affected:

  • (modified) llvm/docs/NVPTXUsage.rst (+26)
  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+10)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+12)
  • (added) llvm/test/CodeGen/NVPTX/bmsk.ll (+77)
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
 zero-extended the case of the '``zext``' variants, or sign-extended the case of
 the '``sext``' variants. If N is 0, the result is 0.
 
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+    declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
 TMA family of Intrinsics
 ------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
           [llvm_i32_ty, llvm_i32_ty],
           [IntrNoMem, IntrSpeculatable]>;
 
+
+//
+// BMSK - bit mask
+//
+  foreach mode = ["wrap", "clamp"] in
+    def int_nvvm_bmsk_ # mode :
+      DefaultAttrsIntrinsic<[llvm_i32_ty],
+        [llvm_i32_ty, llvm_i32_ty],
+        [IntrNoMem, IntrSpeculatable]>;
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
   }
 }
 
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+  defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+  defm BMSK_ # mode
+    : I3Inst<"bmsk." # mode # ".b32",
+             intrin, I32RT, commutative = false,
+             requires = [hasSM<70>, hasPTX<76>]>;
+}
+
 //
 // Convert
 //
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT:    bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT:    bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 5;
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT:    bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+  ret i32 %c
+}

Copy link
Member

@Artem-B Artem-B left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@AlexMaclean AlexMaclean merged commit 4f70917 into llvm:main May 9, 2025
13 of 15 checks passed
all bits set to 0 except for %b bits starting at bit position %a. For the
'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
which in practice is equivalent to using them as is.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

We could add a link to the corresponding page in the PTX ISA.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants