From e4e6b697b205c402b1f21f6a8080c7c5c245bca3 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Thu, 8 May 2025 15:54:57 +0000 Subject: [PATCH 1/3] [NVPTX] Add intrinsics for the szext instruction --- llvm/docs/NVPTXUsage.rst | 93 ++++++++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 11 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 74 +++++++++---------- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 15 ++++ llvm/test/CodeGen/NVPTX/szext.ll | 65 +++++++++++++++++ 5 files changed, 217 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/szext.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index c1426823d87af..331a4b8e08883 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -568,6 +568,99 @@ to left-shift the found bit into the most-significant bit position, otherwise the result is the shift amount needed to right-shift the found bit into the least-significant bit position. 0xffffffff is returned if no 1 bit is found. +'``llvm.nvvm.zext.inreg.clamp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.zext.inreg.clamp``' intrinsic extracts the low bits of the +input value, and zero-extends them back to the original width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.zext.inreg.clamp``' returns the zero-extension of N lowest bits +of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N +is 0, the result is 0. + +'``llvm.nvvm.zext.inreg.wrap``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.zext.inreg.wrap``' intrinsic extracts the low bits of the +input value, and zero-extends them back to the original width. 
+ +Semantics: +"""""""""" + +The '``llvm.nvvm.zext.inreg.wrap``' returns the zero-extension of N lowest bits +of operand %a. N is the value of operand %b modulo 32. If N is 0, the result +is 0. + +'``llvm.nvvm.sext.inreg.clamp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.sext.inreg.clamp``' intrinsic extracts the low bits of the +input value, and sign-extends them back to the original width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.sext.inreg.clamp``' returns the sign-extension of N lowest bits +of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N +is 0, the result is 0. + + +'``llvm.nvvm.sext.inreg.wrap``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.sext.inreg.wrap``' intrinsic extracts the low bits of the +input value, and sign-extends them back to the original width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.sext.inreg.wrap``' returns the sign-extension of N lowest bits +of operand %a. N is the value of operand %b modulo 32. If N is 0, the result +is 0. 
+ TMA family of Intrinsics ------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 8b87822d3fdda..65f0e2209fc6b 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1356,6 +1356,17 @@ let TargetPrefix = "nvvm" in { [llvm_anyint_ty, llvm_i1_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>]>; + +// +// szext +// + foreach ext = ["sext", "zext"] in + foreach mode = ["wrap", "clamp"] in + def int_nvvm_ # ext # _inreg_ # mode : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + // // Convert // diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 11d77599d4ac3..dae6c929eea9e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -240,26 +240,33 @@ def F16X2RT : RegTyInfo; def BF16X2RT : RegTyInfo; +multiclass I3Inst requires = []> { + defvar asmstr = op_str # " \t$dst, $a, $b;"; + + def rr : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>, + Requires; + def ri : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), + asmstr, + [(set t.Ty:$dst, (op_node t.RC:$a, imm:$b))]>, + Requires; + if !not(commutative) then + def ir : + NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (op_node imm:$a, t.RC:$b))]>, + Requires; +} + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "" (e.g. "add.s64"). 
-multiclass I3 { - foreach t = [I16RT, I32RT, I64RT] in { - defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;"; - - def t.Ty # rr : - NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), - asmstr, - [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>; - def t.Ty # ri : - NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), - asmstr, - [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>; - if !not(commutative) then - def t.Ty # ir : - NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), - asmstr, - [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>; - } +multiclass I3 { + foreach t = [I16RT, I32RT, I64RT] in + defm t.Ty# : I3Inst; } class I16x2 : @@ -270,26 +277,11 @@ class I16x2 : // Template for instructions which take 3 int args. The instructions are // named ".s32" (e.g. "addc.cc.s32"). -multiclass ADD_SUB_INT_CARRY { +multiclass ADD_SUB_INT_CARRY { let hasSideEffects = 1 in { - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; - def i64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, i64:$b))]>, - Requires<[hasPTX<43>]>; - def i64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, imm:$b))]>, - Requires<[hasPTX<43>]>; + defm i32 : I3Inst; + defm i64 : I3Inst]>; } } @@ -847,12 +839,12 @@ defm SUB : I3<"sub.s", sub, /*commutative=*/ false>; def ADD16x2 : I16x2<"add.s", add>; // in32 and int64 addition and subtraction with carry-out. 
-defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>; -defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; +defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>; +defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>; // int32 and int64 addition and subtraction with carry-in and carry-out. -defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; -defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; +defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>; +defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 3eedb43e4c81a..12c886cb0ca4e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1678,6 +1678,21 @@ foreach t = [I32RT, I64RT] in { } } +// +// szext +// + +foreach sign = ["s", "u"] in { + foreach mode = ["wrap", "clamp"] in { + defvar ext = !if(!eq(sign, "s"), "sext", "zext"); + defvar intrin = !cast("int_nvvm_" # ext # "_inreg_" # mode); + defm SZEXT_ # sign # _ # mode + : I3Inst<"szext." # mode # "." 
# sign # "32", + intrin, I32RT, commutative = false, + requires = [hasSM<70>, hasPTX<76>]>; + } +} + // // Convert // diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll new file mode 100644 index 0000000000000..a86c06c24ed98 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +define i32 @szext_wrap_u32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_wrap_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_u32_param_1]; +; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_u32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_clamp_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_u32_param_1]; +; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_wrap_s32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_wrap_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_s32_param_1]; +; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_s32(i32 %a, i32 %b) { 
+; CHECK-LABEL: szext_clamp_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_s32_param_1]; +; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) + ret i32 %c +} + From abf18acf2d4c246a9d08b3c476323cc4f72647ab Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Thu, 8 May 2025 21:08:10 +0000 Subject: [PATCH 2/3] address comments --- llvm/docs/NVPTXUsage.rst | 93 ++++-------------------- llvm/include/llvm/IR/IntrinsicsNVVM.td | 2 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 67 +++++------------ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +- llvm/test/CodeGen/NVPTX/i128.ll | 92 +++++++++++------------ llvm/test/CodeGen/NVPTX/szext.ll | 49 ++++++++++++- 6 files changed, 128 insertions(+), 177 deletions(-) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 331a4b8e08883..b6222300e4d4a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -568,98 +568,35 @@ to left-shift the found bit into the most-significant bit position, otherwise the result is the shift amount needed to right-shift the found bit into the least-significant bit position. 0xffffffff is returned if no 1 bit is found. -'``llvm.nvvm.zext.inreg.clamp``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -.. code-block:: llvm - - declare i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) - -Overview: -""""""""" - -The '``llvm.nvvm.zext.inreg.clamp``' intrinsic extracts the low bits of the -input value, and zero-extends them back to the original width. - -Semantics: -"""""""""" - -The '``llvm.nvvm.zext.inreg.clamp``' returns the zero-extension of N lowest bits -of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N -is 0, the result is 0. 
- -'``llvm.nvvm.zext.inreg.wrap``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -.. code-block:: llvm - - declare i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) - -Overview: -""""""""" - -The '``llvm.nvvm.zext.inreg.wrap``' intrinsic extracts the low bits of the -input value, and zero-extends them back to the original width. - -Semantics: -"""""""""" - -The '``llvm.nvvm.zext.inreg.wrap``' returns the zero-extension of N lowest bits -of operand %a. N is the value of operand %b modulo 32. If N is 0, the result -is 0. - -'``llvm.nvvm.sext.inreg.clamp``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -.. code-block:: llvm - - declare i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) - -Overview: -""""""""" - -The '``llvm.nvvm.sext.inreg.clamp``' intrinsic extracts the low bits of the -input value, and sign-extends them back to the original width. - -Semantics: -"""""""""" - -The '``llvm.nvvm.sext.inreg.clamp``' returns the sign-extension of N lowest bits -of operand %a. N is the value of operand %b clamped to the range [0, 32]. If N -is 0, the result is 0. - - -'``llvm.nvvm.sext.inreg.wrap``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.nvvm.{zext,sext}.{wrap,clamp}``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" .. code-block:: llvm - declare i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b) + declare i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b) Overview: """"""""" -The '``llvm.nvvm.sext.inreg.wrap``' intrinsic extracts the low bits of the -input value, and sign-extends them back to the original width. +The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics extracts the +low bits of the input value, and zero- or sign-extends them back to the original +width. 
Semantics: """""""""" -The '``llvm.nvvm.sext.inreg.wrap``' returns the sign-extension of N lowest bits -of operand %a. N is the value of operand %b modulo 32. If N is 0, the result -is 0. +The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics returns the +extension of the N lowest bits of operand %a. For the '``wrap``' variants, N is the +value of operand %b modulo 32. For the '``clamp``' variants, N is the value of +operand %b clamped to the range [0, 32]. The N lowest bits are then +zero-extended in the case of the '``zext``' variants, or sign-extended in the case of +the '``sext``' variants. If N is 0, the result is 0. TMA family of Intrinsics ------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 65f0e2209fc6b..2851206f2e84a 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1362,7 +1362,7 @@ let TargetPrefix = "nvvm" in { // foreach ext = ["sext", "zext"] in foreach mode = ["wrap", "clamp"] in - def int_nvvm_ # ext # _inreg_ # mode : + def int_nvvm_ # ext # _ # mode : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dae6c929eea9e..a384cb79d645a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -227,6 +227,7 @@ class RegTyInfo; def I16RT : RegTyInfo; def I32RT : RegTyInfo; def I64RT : RegTyInfo; @@ -252,13 +253,13 @@ multiclass I3Inst, + [(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>, Requires; if !not(commutative) then def ir : NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), asmstr, - [(set t.Ty:$dst, (op_node imm:$a, t.RC:$b))]>, + [(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>, Requires; } @@ -833,8 +834,8 @@ defm SUB_i1 : ADD_SUB_i1; // int16, int32, and int64 signed addition. 
Since nvptx is 2's complement, we // also use these for unsigned arithmetic. -defm ADD : I3<"add.s", add, /*commutative=*/ true>; -defm SUB : I3<"sub.s", sub, /*commutative=*/ false>; +defm ADD : I3<"add.s", add, commutative = true>; +defm SUB : I3<"sub.s", sub, commutative = false>; def ADD16x2 : I16x2<"add.s", add>; @@ -846,18 +847,18 @@ defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>; defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>; defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; -defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>; +defm MULT : I3<"mul.lo.s", mul, commutative = true>; -defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>; -defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>; +defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>; +defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>; -defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>; -defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>; +defm SDIV : I3<"div.s", sdiv, commutative = false>; +defm UDIV : I3<"div.u", udiv, commutative = false>; // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM // will lower it. -defm SREM : I3<"rem.s", srem, /*commutative=*/ false>; -defm UREM : I3<"rem.u", urem, /*commutative=*/ false>; +defm SREM : I3<"rem.s", srem, commutative = false>; +defm UREM : I3<"rem.u", urem, commutative = false>; // Integer absolute value. NumBits should be one minus the bit width of RC. // This idiom implements the algorithm at @@ -872,10 +873,10 @@ defm ABS_32 : ABS; defm ABS_64 : ABS; // Integer min/max. 
-defm SMAX : I3<"max.s", smax, /*commutative=*/ true>; -defm UMAX : I3<"max.u", umax, /*commutative=*/ true>; -defm SMIN : I3<"min.s", smin, /*commutative=*/ true>; -defm UMIN : I3<"min.u", umin, /*commutative=*/ true>; +defm SMAX : I3<"max.s", smax, commutative = true>; +defm UMAX : I3<"max.u", umax, commutative = true>; +defm SMIN : I3<"min.s", smin, commutative = true>; +defm UMIN : I3<"min.u", umin, commutative = true>; def SMAX16x2 : I16x2<"max.s", smax>; def UMAX16x2 : I16x2<"max.u", umax>; @@ -1385,38 +1386,10 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), // Template for three-arg bitwise operations. Takes three args, Creates .b16, // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. multiclass BITWISE { - def b1rr : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set i1:$dst, (OpNode i1:$a, i1:$b))]>; - def b1ri : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set i1:$dst, (OpNode i1:$a, imm:$b))]>; - def b16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set i16:$dst, (OpNode i16:$a, i16:$b))]>; - def b16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set i16:$dst, (OpNode i16:$a, imm:$b))]>; - def b32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def b32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; - def b64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, i64:$b))]>; - def b64ri : - 
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, imm:$b))]>; + defm b1 : I3Inst; + defm b16 : I3Inst; + defm b32 : I3Inst; + defm b64 : I3Inst; } defm OR : BITWISE<"or", or>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 12c886cb0ca4e..7b139d7b79e7d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1685,7 +1685,7 @@ foreach t = [I32RT, I64RT] in { foreach sign = ["s", "u"] in { foreach mode = ["wrap", "clamp"] in { defvar ext = !if(!eq(sign, "s"), "sext", "zext"); - defvar intrin = !cast("int_nvvm_" # ext # "_inreg_" # mode); + defvar intrin = !cast("int_nvvm_" # ext # "_" # mode); defm SZEXT_ # sign # _ # mode : I3Inst<"szext." # mode # "." # sign # "32", intrin, I32RT, commutative = false, diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 64786e601c4b5..f1ca19b30ac2a 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -13,37 +13,37 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.b64 %rd117, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46; +; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50; +; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1; +; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1; +; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; ; CHECK-NEXT: 
setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2; -; CHECK-NEXT: or.b64 %rd56, %rd5, %rd6; -; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0; -; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4; -; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0; +; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2; +; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2; +; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6; +; CHECK-NEXT: setp.eq.s64 %p3, %rd55, 0; +; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; +; CHECK-NEXT: setp.eq.s64 %p4, %rd56, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.s64 %p6, %rd6, 0; ; CHECK-NEXT: clz.b64 %r1, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd57, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; -; CHECK-NEXT: add.s64 %rd60, %rd59, 64; -; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd58, %r2; +; CHECK-NEXT: add.s64 %rd59, %rd58, 64; +; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6; ; CHECK-NEXT: setp.ne.s64 %p7, %rd4, 0; ; CHECK-NEXT: clz.b64 %r3, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd61, %r3; ; CHECK-NEXT: clz.b64 %r4, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; -; CHECK-NEXT: add.s64 %rd64, %rd63, 64; -; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65; +; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; +; CHECK-NEXT: add.s64 %rd63, %rd62, 64; +; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; +; CHECK-NEXT: mov.b64 %rd117, 0; +; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; ; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; ; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; ; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0; @@ -314,39 +314,39 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1]; -; 
CHECK-NEXT: mov.b64 %rd112, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46; +; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50; +; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1; +; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1; +; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2; -; CHECK-NEXT: xor.b64 %rd56, %rd50, %rd46; -; CHECK-NEXT: shr.s64 %rd5, %rd56, 63; -; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4; -; CHECK-NEXT: setp.eq.s64 %p3, %rd57, 0; -; CHECK-NEXT: or.b64 %rd58, %rd1, %rd2; -; CHECK-NEXT: setp.eq.s64 %p4, %rd58, 0; +; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2; +; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2; +; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46; +; CHECK-NEXT: shr.s64 %rd5, %rd55, 63; +; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; +; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0; +; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2; +; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.s64 %p6, %rd4, 0; ; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd60, %r2; -; CHECK-NEXT: add.s64 %rd61, %rd60, 64; -; CHECK-NEXT: selp.b64 %rd62, %rd59, %rd61, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; +; CHECK-NEXT: add.s64 %rd60, %rd59, 64; +; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; ; CHECK-NEXT: setp.ne.s64 %p7, %rd2, 0; ; CHECK-NEXT: clz.b64 %r3, %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd62, 
%r3; ; CHECK-NEXT: clz.b64 %r4, %rd1; -; CHECK-NEXT: cvt.u64.u32 %rd64, %r4; -; CHECK-NEXT: add.s64 %rd65, %rd64, 64; -; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7; -; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66; +; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; +; CHECK-NEXT: add.s64 %rd64, %rd63, 64; +; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; +; CHECK-NEXT: mov.b64 %rd112, 0; +; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; ; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; ; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; ; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0; diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll index a86c06c24ed98..854d0860bc596 100644 --- a/llvm/test/CodeGen/NVPTX/szext.ll +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -14,7 +14,7 @@ define i32 @szext_wrap_u32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.zext.inreg.wrap(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b) ret i32 %c } @@ -29,7 +29,7 @@ define i32 @szext_clamp_u32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.zext.inreg.clamp(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b) ret i32 %c } @@ -44,7 +44,7 @@ define i32 @szext_wrap_s32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.sext.inreg.wrap(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b) ret i32 %c } @@ -59,7 +59,48 @@ define i32 @szext_clamp_s32(i32 %a, i32 %b) { ; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; - %c = call i32 @llvm.nvvm.sext.inreg.clamp(i32 %a, i32 %b) + %c = call i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b) ret i32 %c } +define i32 @szext_clamp_s32_ii() { +; 
CHECK-LABEL: szext_clamp_s32_ii( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b32 %r1, 3; +; CHECK-NEXT: szext.clamp.s32 %r2, %r1, 4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.clamp(i32 3, i32 4) + ret i32 %c +} + +define i32 @szext_wrap_s32_ir(i32 %a) { +; CHECK-LABEL: szext_wrap_s32_ir( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_ir_param_0]; +; CHECK-NEXT: szext.wrap.s32 %r2, 5, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.wrap(i32 5, i32 %a) + ret i32 %c +} + +define i32 @szext_clamp_u32_ri(i32 %a) { +; CHECK-LABEL: szext_clamp_u32_ri( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_ri_param_0]; +; CHECK-NEXT: szext.clamp.u32 %r2, %r1, 7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 7) + ret i32 %c +} From ca00d92b342e5cbe69c9fd4385d8834ed0c39615 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Fri, 9 May 2025 03:11:21 +0000 Subject: [PATCH 3/3] update test --- llvm/test/CodeGen/NVPTX/szext.ll | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll index 854d0860bc596..f159156c6b80f 100644 --- a/llvm/test/CodeGen/NVPTX/szext.ll +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} -target triple = "nvptx-unknown-cuda" +target triple = "nvptx64-unknown-cuda" define i32 @szext_wrap_u32(i32 %a, i32 %b) { ; 
CHECK-LABEL: szext_wrap_u32(