From e4e6b697b205c402b1f21f6a8080c7c5c245bca3 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Thu, 8 May 2025 15:54:57 +0000 Subject: [PATCH 1/3] [NVPTX] Add intrinsics for the szext instruction --- llvm/docs/NVPTXUsage.rst | 93 ++++++++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 11 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 74 +++++++++---------- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 15 ++++ llvm/test/CodeGen/NVPTX/szext.ll | 65 +++++++++++++++++ 5 files changed, 217 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/szext.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index c1426823d87af..331a4b8e08883 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -568,6 +568,99 @@ to left-shift the found bit into the most-significant bit position, otherwise the result is the shift amount needed to right-shift the found bit into the least-significant bit position. 0xffffffff is returned if no 1 bit is found. +'``llvm.nvvm.zext.inreg.clamp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.zext.inreg.clamp``' intrinsic extracts the low bits of the +input value, and zero-extends them back to the original width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.zext.inreg.clamp``' returns the zero-extension of N lowest bits +of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N +is 0, the result is 0. + +'``llvm.nvvm.zext.inreg.wrap``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.zext.inreg.wrap``' intrinsic extracts the low bits of the +input value, and zero-extends them back to the original width. 
+ +Semantics: +"""""""""" + +The '``llvm.nvvm.zext.inreg.wrap``' returns the zero-extension of N lowest bits +of operand %a. N is the value of operand %b modulo 32. If N is 0, the result +is 0. + +'``llvm.nvvm.sext.inreg.clamp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.sext.inreg.clamp``' intrinsic extracts the low bits of the +input value, and sign-extends them back to the original width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.sext.inreg.clamp``' returns the sign-extension of N lowest bits +of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N +is 0, the result is 0. + + +'``llvm.nvvm.sext.inreg.wrap``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.sext.inreg.wrap``' intrinsic extracts the low bits of the +input value, and sign-extends them back to the original width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.sext.inreg.wrap``' returns the sign-extension of N lowest bits +of operand %a. N is the value of operand %b modulo 32. If N is 0, the result +is 0. 
+ TMA family of Intrinsics ------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 8b87822d3fdda..65f0e2209fc6b 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1356,6 +1356,17 @@ let TargetPrefix = "nvvm" in { [llvm_anyint_ty, llvm_i1_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>]>; + +// +// szext +// + foreach ext = ["sext", "zext"] in + foreach mode = ["wrap", "clamp"] in + def int_nvvm_ # ext # _inreg_ # mode : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + // // Convert // diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 11d77599d4ac3..dae6c929eea9e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -240,26 +240,33 @@ def F16X2RT : RegTyInfo; def BF16X2RT : RegTyInfo; +multiclass I3Inst requires = []> { + defvar asmstr = op_str # " \t$dst, $a, $b;"; + + def rr : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>, + Requires; + def ri : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), + asmstr, + [(set t.Ty:$dst, (op_node t.RC:$a, imm:$b))]>, + Requires; + if !not(commutative) then + def ir : + NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (op_node imm:$a, t.RC:$b))]>, + Requires; +} + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "" (e.g. "add.s64"). 
-multiclass I3 { - foreach t = [I16RT, I32RT, I64RT] in { - defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;"; - - def t.Ty # rr : - NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), - asmstr, - [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>; - def t.Ty # ri : - NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), - asmstr, - [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>; - if !not(commutative) then - def t.Ty # ir : - NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), - asmstr, - [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>; - } +multiclass I3 { + foreach t = [I16RT, I32RT, I64RT] in + defm t.Ty# : I3Inst; } class I16x2 : @@ -270,26 +277,11 @@ class I16x2 : // Template for instructions which take 3 int args. The instructions are // named ".s32" (e.g. "addc.cc.s32"). -multiclass ADD_SUB_INT_CARRY { +multiclass ADD_SUB_INT_CARRY { let hasSideEffects = 1 in { - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; - def i64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, i64:$b))]>, - Requires<[hasPTX<43>]>; - def i64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, imm:$b))]>, - Requires<[hasPTX<43>]>; + defm i32 : I3Inst; + defm i64 : I3Inst]>; } } @@ -847,12 +839,12 @@ defm SUB : I3<"sub.s", sub, /*commutative=*/ false>; def ADD16x2 : I16x2<"add.s", add>; // in32 and int64 addition and subtraction with carry-out. 
-defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>; -defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; +defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>; +defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>; // int32 and int64 addition and subtraction with carry-in and carry-out. -defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; -defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; +defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>; +defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 3eedb43e4c81a..12c886cb0ca4e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1678,6 +1678,21 @@ foreach t = [I32RT, I64RT] in { } } +// +// szext +// + +foreach sign = ["s", "u"] in { + foreach mode = ["wrap", "clamp"] in { + defvar ext = !if(!eq(sign, "s"), "sext", "zext"); + defvar intrin = !cast("int_nvvm_" # ext # "_inreg_" # mode); + defm SZEXT_ # sign # _ # mode + : I3Inst<"szext." # mode # "." 
# sign # "32", + intrin, I32RT, commutative = false, + requires = [hasSM<70>, hasPTX<76>]>; + } +} + // // Convert // diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll new file mode 100644 index 0000000000000..a86c06c24ed98 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +define i32 @szext_wrap_u32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_wrap_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_u32_param_1]; +; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_u32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_clamp_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_u32_param_1]; +; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_wrap_s32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_wrap_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_s32_param_1]; +; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_s32(i32 %a, i32 %b) { 
+; CHECK-LABEL: szext_clamp_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_s32_param_1]; +; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) + ret i32 %c +} + From abf18acf2d4c246a9d08b3c476323cc4f72647ab Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Thu, 8 May 2025 21:08:10 +0000 Subject: [PATCH 2/3] address comments --- llvm/docs/NVPTXUsage.rst | 93 ++++-------------------- llvm/include/llvm/IR/IntrinsicsNVVM.td | 2 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 67 +++++------------ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +- llvm/test/CodeGen/NVPTX/i128.ll | 92 +++++++++++------------ llvm/test/CodeGen/NVPTX/szext.ll | 49 ++++++++++++- 6 files changed, 128 insertions(+), 177 deletions(-) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 331a4b8e08883..b6222300e4d4a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -568,98 +568,35 @@ to left-shift the found bit into the most-significant bit position, otherwise the result is the shift amount needed to right-shift the found bit into the least-significant bit position. 0xffffffff is returned if no 1 bit is found. -'``llvm.nvvm.zext.inreg.clamp``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -.. code-block:: llvm - - declare i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) - -Overview: -""""""""" - -The '``llvm.nvvm.zext.inreg.clamp``' intrinsic extracts the low bits of the -input value, and zero-extends them back to the original width. - -Semantics: -"""""""""" - -The '``llvm.nvvm.zext.inreg.clamp``' returns the zero-extension of N lowest bits -of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N -is 0, the result is 0. 
- -'``llvm.nvvm.zext.inreg.wrap``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -.. code-block:: llvm - - declare i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) - -Overview: -""""""""" - -The '``llvm.nvvm.zext.inreg.wrap``' intrinsic extracts the low bits of the -input value, and zero-extends them back to the original width. - -Semantics: -"""""""""" - -The '``llvm.nvvm.zext.inreg.wrap``' returns the zero-extension of N lowest bits -of operand %a. N is the value of operand %b modulo 32. If N is 0, the result -is 0. - -'``llvm.nvvm.sext.inreg.clamp``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -.. code-block:: llvm - - declare i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) - -Overview: -""""""""" - -The '``llvm.nvvm.sext.inreg.clamp``' intrinsic extracts the low bits of the -input value, and sign-extends them back to the original width. - -Semantics: -"""""""""" - -The '``llvm.nvvm.sext.inreg.clamp``' returns the sign-extension of N lowest bits -of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N -is 0, the result is 0. - - -'``llvm.nvvm.sext.inreg.wrap``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.nvvm.{zext,sext}.{wrap,clamp}``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" .. code-block:: llvm - declare i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b) + declare i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b) Overview: """"""""" -The '``llvm.nvvm.sext.inreg.wrap``' intrinsic extracts the low bits of the -input value, and sign-extends them back to the original width. +The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics extracts the +low bits of the input value, and zero- or sign-extends them back to the original +width. 
Semantics: """""""""" -The '``llvm.nvvm.sext.inreg.wrap``' returns the sign-extension of N lowest bits -of operand %a. N is the value of operand %b modulo 32. If N is 0, the result -is 0. +The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics returns the +extension of the N lowest bits of operand %a. For the '``wrap``' variants, N is the +value of operand %b modulo 32. For the '``clamp``' variants, N is the value of +operand %b clamped to the range [0, 32]. The N lowest bits are then +zero-extended in the case of the '``zext``' variants, or sign-extended in the case of +the '``sext``' variants. If N is 0, the result is 0. TMA family of Intrinsics ------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 65f0e2209fc6b..2851206f2e84a 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1362,7 +1362,7 @@ let TargetPrefix = "nvvm" in { // foreach ext = ["sext", "zext"] in foreach mode = ["wrap", "clamp"] in - def int_nvvm_ # ext # _inreg_ # mode : + def int_nvvm_ # ext # _ # mode : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dae6c929eea9e..a384cb79d645a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -227,6 +227,7 @@ class RegTyInfo; def I16RT : RegTyInfo; def I32RT : RegTyInfo; def I64RT : RegTyInfo; @@ -252,13 +253,13 @@ multiclass I3Inst, + [(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>, Requires; if !not(commutative) then def ir : NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), asmstr, - [(set t.Ty:$dst, (op_node imm:$a, t.RC:$b))]>, + [(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>, Requires; } @@ -833,8 +834,8 @@ defm SUB_i1 : ADD_SUB_i1; // int16, int32, and int64 signed addition. 
Since nvptx is 2's complement, we // also use these for unsigned arithmetic. -defm ADD : I3<"add.s", add, /*commutative=*/ true>; -defm SUB : I3<"sub.s", sub, /*commutative=*/ false>; +defm ADD : I3<"add.s", add, commutative = true>; +defm SUB : I3<"sub.s", sub, commutative = false>; def ADD16x2 : I16x2<"add.s", add>; @@ -846,18 +847,18 @@ defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>; defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>; defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; -defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>; +defm MULT : I3<"mul.lo.s", mul, commutative = true>; -defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>; -defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>; +defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>; +defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>; -defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>; -defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>; +defm SDIV : I3<"div.s", sdiv, commutative = false>; +defm UDIV : I3<"div.u", udiv, commutative = false>; // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM // will lower it. -defm SREM : I3<"rem.s", srem, /*commutative=*/ false>; -defm UREM : I3<"rem.u", urem, /*commutative=*/ false>; +defm SREM : I3<"rem.s", srem, commutative = false>; +defm UREM : I3<"rem.u", urem, commutative = false>; // Integer absolute value. NumBits should be one minus the bit width of RC. // This idiom implements the algorithm at @@ -872,10 +873,10 @@ defm ABS_32 : ABS; defm ABS_64 : ABS; // Integer min/max. 
-defm SMAX : I3<"max.s", smax, /*commutative=*/ true>; -defm UMAX : I3<"max.u", umax, /*commutative=*/ true>; -defm SMIN : I3<"min.s", smin, /*commutative=*/ true>; -defm UMIN : I3<"min.u", umin, /*commutative=*/ true>; +defm SMAX : I3<"max.s", smax, commutative = true>; +defm UMAX : I3<"max.u", umax, commutative = true>; +defm SMIN : I3<"min.s", smin, commutative = true>; +defm UMIN : I3<"min.u", umin, commutative = true>; def SMAX16x2 : I16x2<"max.s", smax>; def UMAX16x2 : I16x2<"max.u", umax>; @@ -1385,38 +1386,10 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), // Template for three-arg bitwise operations. Takes three args, Creates .b16, // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. multiclass BITWISE { - def b1rr : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set i1:$dst, (OpNode i1:$a, i1:$b))]>; - def b1ri : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set i1:$dst, (OpNode i1:$a, imm:$b))]>; - def b16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set i16:$dst, (OpNode i16:$a, i16:$b))]>; - def b16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set i16:$dst, (OpNode i16:$a, imm:$b))]>; - def b32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def b32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; - def b64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, i64:$b))]>; - def b64ri : - 
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, imm:$b))]>; + defm b1 : I3Inst; + defm b16 : I3Inst; + defm b32 : I3Inst; + defm b64 : I3Inst; } defm OR : BITWISE<"or", or>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 12c886cb0ca4e..7b139d7b79e7d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1685,7 +1685,7 @@ foreach t = [I32RT, I64RT] in { foreach sign = ["s", "u"] in { foreach mode = ["wrap", "clamp"] in { defvar ext = !if(!eq(sign, "s"), "sext", "zext"); - defvar intrin = !cast("int_nvvm_" # ext # "_inreg_" # mode); + defvar intrin = !cast("int_nvvm_" # ext # "_" # mode); defm SZEXT_ # sign # _ # mode : I3Inst<"szext." # mode # "." # sign # "32", intrin, I32RT, commutative = false, diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 64786e601c4b5..f1ca19b30ac2a 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -13,37 +13,37 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.b64 %rd117, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46; +; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50; +; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1; +; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1; +; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; ; CHECK-NEXT: 
setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2; -; CHECK-NEXT: or.b64 %rd56, %rd5, %rd6; -; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0; -; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4; -; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0; +; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2; +; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2; +; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6; +; CHECK-NEXT: setp.eq.s64 %p3, %rd55, 0; +; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; +; CHECK-NEXT: setp.eq.s64 %p4, %rd56, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.s64 %p6, %rd6, 0; ; CHECK-NEXT: clz.b64 %r1, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd57, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; -; CHECK-NEXT: add.s64 %rd60, %rd59, 64; -; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd58, %r2; +; CHECK-NEXT: add.s64 %rd59, %rd58, 64; +; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6; ; CHECK-NEXT: setp.ne.s64 %p7, %rd4, 0; ; CHECK-NEXT: clz.b64 %r3, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd61, %r3; ; CHECK-NEXT: clz.b64 %r4, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; -; CHECK-NEXT: add.s64 %rd64, %rd63, 64; -; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65; +; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; +; CHECK-NEXT: add.s64 %rd63, %rd62, 64; +; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; +; CHECK-NEXT: mov.b64 %rd117, 0; +; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; ; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; ; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; ; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0; @@ -314,39 +314,39 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1]; -; 
CHECK-NEXT: mov.b64 %rd112, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46; +; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50; +; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1; +; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1; +; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2; -; CHECK-NEXT: xor.b64 %rd56, %rd50, %rd46; -; CHECK-NEXT: shr.s64 %rd5, %rd56, 63; -; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4; -; CHECK-NEXT: setp.eq.s64 %p3, %rd57, 0; -; CHECK-NEXT: or.b64 %rd58, %rd1, %rd2; -; CHECK-NEXT: setp.eq.s64 %p4, %rd58, 0; +; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2; +; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2; +; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46; +; CHECK-NEXT: shr.s64 %rd5, %rd55, 63; +; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; +; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0; +; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2; +; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.s64 %p6, %rd4, 0; ; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd60, %r2; -; CHECK-NEXT: add.s64 %rd61, %rd60, 64; -; CHECK-NEXT: selp.b64 %rd62, %rd59, %rd61, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; +; CHECK-NEXT: add.s64 %rd60, %rd59, 64; +; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; ; CHECK-NEXT: setp.ne.s64 %p7, %rd2, 0; ; CHECK-NEXT: clz.b64 %r3, %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd62, 
%r3; ; CHECK-NEXT: clz.b64 %r4, %rd1; -; CHECK-NEXT: cvt.u64.u32 %rd64, %r4; -; CHECK-NEXT: add.s64 %rd65, %rd64, 64; -; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7; -; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66; +; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; +; CHECK-NEXT: add.s64 %rd64, %rd63, 64; +; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; +; CHECK-NEXT: mov.b64 %rd112, 0; +; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; ; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; ; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; ; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0; diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll index a86c06c24ed98..854d0860bc596 100644 --- a/llvm/test/CodeGen/NVPTX/szext.ll +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -14,7 +14,7 @@ define i32 @szext_wrap_u32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b) ret i32 %c } @@ -29,7 +29,7 @@ define i32 @szext_clamp_u32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b) ret i32 %c } @@ -44,7 +44,7 @@ define i32 @szext_wrap_s32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b) ret i32 %c } @@ -59,7 +59,48 @@ define i32 @szext_clamp_s32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b) ret i32 %c } +define i32 @szext_clamp_s32_ii() { +; 
CHECK-LABEL: szext_clamp_s32_ii( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b32 %r1, 3; +; CHECK-NEXT: szext.clamp.s32 %r2, %r1, 4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.clamp(i32 3, i32 4) + ret i32 %c +} + +define i32 @szext_wrap_s32_ir(i32 %a) { +; CHECK-LABEL: szext_wrap_s32_ir( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_ir_param_0]; +; CHECK-NEXT: szext.wrap.s32 %r2, 5, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.wrap(i32 5, i32 %a) + ret i32 %c +} + +define i32 @szext_clamp_u32_ri(i32 %a) { +; CHECK-LABEL: szext_clamp_u32_ri( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_ri_param_0]; +; CHECK-NEXT: szext.clamp.u32 %r2, %r1, 7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 7) + ret i32 %c +} From ca00d92b342e5cbe69c9fd4385d8834ed0c39615 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Fri, 9 May 2025 03:11:21 +0000 Subject: [PATCH 3/3] update test --- llvm/test/CodeGen/NVPTX/szext.ll | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll index 854d0860bc596..f159156c6b80f 100644 --- a/llvm/test/CodeGen/NVPTX/szext.ll +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} -target triple = "nvptx-unknown-cuda" +target triple = "nvptx64-unknown-cuda" define i32 @szext_wrap_u32(i32 %a, i32 %b) { ; 
CHECK-LABEL: szext_wrap_u32(