diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b7097308f6e89..f225b9e8bd268 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -378,6 +378,60 @@ right, and the least significant bits are extracted to produce a result that is
 the same size as the original arguments. The shift amount is the minimum of the
 value of %n and the bit width of the integer type.
 
+'``llvm.nvvm.flo.u.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 %shiftamt)
+    declare i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 %shiftamt)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.flo.u``' family of intrinsics identifies the bit position of the
+leading one, returning either its offset from the most or least significant bit.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.flo.u``' family of intrinsics returns the bit position of the
+most significant 1. If %shiftamt is true, the result is the shift amount needed
+to left-shift the found bit into the most-significant bit position, otherwise
+the result is the shift amount needed to right-shift the found bit into the
+least-significant bit position. 0xffffffff is returned if no 1 bit is found.
+
+'``llvm.nvvm.flo.s.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 %shiftamt)
+    declare i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 %shiftamt)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.flo.s``' family of intrinsics identifies the bit position of the
+leading non-sign bit, returning either its offset from the most or least
+significant bit.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.flo.s``' family of intrinsics returns the bit position of the
+most significant 0 for negative inputs and the most significant 1 for
+non-negative inputs. If %shiftamt is true, the result is the shift amount needed
+to left-shift the found bit into the most-significant bit position, otherwise
+the result is the shift amount needed to right-shift the found bit into the
+least-significant bit position. 0xffffffff is returned if no non-sign bit is
+found.
 
 Other Intrinsics
 ----------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5164f873d00f4..fd0cbed8b2566 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1092,6 +1092,14 @@ let TargetPrefix = "nvvm" in {
         [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
         [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
 
+//
+// FLO - Find Leading One
+//
+  foreach sign = ["s", "u"] in
+    def int_nvvm_flo_ # sign :
+      DefaultAttrsIntrinsic<[llvm_i32_ty],
+        [llvm_anyint_ty, llvm_i1_ty],
+        [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
 
 //
 // Convert
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f5ac3c4e96436..1f4938d9fcf5a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6,6 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Utility class to wrap up information about a register and DAG type for more
+// convenient iteration and parameterization
+class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
+  ValueType Ty = ty;
+  NVPTXRegClass RC = rc;
+  Operand Imm = imm;
+  int Size = ty.Size;
+}
+
+def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
+def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
+
+
 def immFloat0 : PatLeaf<(fpimm), [{
     float f = (float)N->getValueAPF().convertToFloat();
     return (f==0.0f);
@@ -1299,6 +1312,27 @@ def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
 def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
   Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
 
+//
+// BFIND
+//
+
+foreach t = [I32RT, I64RT] in {
+  foreach sign = ["s", "u"] in {
+    defvar flo_intrin = !cast<Intrinsic>("int_nvvm_flo_" # sign);
+    // The i1 immarg selects the variant: false (matched as 0) emits bfind,
+    // true (matched as -1) emits bfind.shiftamt.
+    def BFIND_ # sign # t.Size
+      : NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
+                  "bfind." # sign # t.Size # " \t$dst, $src;",
+                  [(set (i32 Int32Regs:$dst), (flo_intrin (t.Ty t.RC:$src), 0))]>;
+
+    def BFIND_SHIFTAMT_ # sign # t.Size
+      : NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
+                  "bfind.shiftamt." # sign # t.Size # " \t$dst, $src;",
+                  [(set (i32 Int32Regs:$dst), (flo_intrin (t.Ty t.RC:$src), -1))]>;
+  }
+}
+
 //
 // Convert
 //
diff --git a/llvm/test/CodeGen/NVPTX/flo.ll b/llvm/test/CodeGen/NVPTX/flo.ll
new file mode 100644
index 0000000000000..bc7f765e40ab4
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flo.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @flo_1(i32 %a) {
+; CHECK-LABEL: flo_1(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [flo_1_param_0];
+; CHECK-NEXT:    bfind.s32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 false)
+  ret i32 %r
+}
+
+
+define i32 @flo_2(i32 %a) {
+; CHECK-LABEL: flo_2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [flo_2_param_0];
+; CHECK-NEXT:    bfind.shiftamt.s32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 true)
+  ret i32 %r
+}
+
+define i32 @flo_3(i32 %a) {
+; CHECK-LABEL: flo_3(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [flo_3_param_0];
+; CHECK-NEXT:    bfind.u32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 false)
+  ret i32 %r
+}
+
+
+define i32 @flo_4(i32 %a) {
+; CHECK-LABEL: flo_4(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [flo_4_param_0];
+; CHECK-NEXT:    bfind.shiftamt.u32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 true)
+  ret i32 %r
+}
+
+
+
+define i32 @flo_5(i64 %a) {
+; CHECK-LABEL: flo_5(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [flo_5_param_0];
+; CHECK-NEXT:    bfind.s64 %r1, %rd1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 false)
+  ret i32 %r
+}
+
+
+define i32 @flo_6(i64 %a) {
+; CHECK-LABEL: flo_6(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [flo_6_param_0];
+; CHECK-NEXT:    bfind.shiftamt.s64 %r1, %rd1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 true)
+  ret i32 %r
+}
+
+define i32 @flo_7(i64 %a) {
+; CHECK-LABEL: flo_7(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [flo_7_param_0];
+; CHECK-NEXT:    bfind.u64 %r1, %rd1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 false)
+  ret i32 %r
+}
+
+
+define i32 @flo_8(i64 %a) {
+; CHECK-LABEL: flo_8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [flo_8_param_0];
+; CHECK-NEXT:    bfind.shiftamt.u64 %r1, %rd1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %r = call i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 true)
+  ret i32 %r
+}
+
+declare i32 @llvm.nvvm.flo.s.i32(i32, i1)
+declare i32 @llvm.nvvm.flo.u.i32(i32, i1)
+declare i32 @llvm.nvvm.flo.s.i64(i64, i1)
+declare i32 @llvm.nvvm.flo.u.i64(i64, i1)