Skip to content
45 changes: 45 additions & 0 deletions llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/FormatVariadic.h"

Expand Down Expand Up @@ -106,6 +107,42 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
return true;
}

// Lowers an llvm.fpbuiltin.* call to the matching NVVM approximate intrinsic
// when one is available. Intended for calls carrying a 3.0 max-error
// attribute; this function does not inspect the attribute itself.
//
// Only fdiv and sqrt are handled for now, even though NVVM also offers
// approximate variants for sin, cos, exp2 and log2. The NVVM fdiv and sqrt
// intrinsics used here are emitted only for scalar float, so vector and
// non-float calls are handed over to replaceWithLLVMIR, which emits the
// standard LLVM math operation instead.
//
// Returns false without touching the call for any other fpbuiltin intrinsic.
// Otherwise returns the result of replaceWithLLVMIR on the fallback path, or
// true once all uses of the call have been redirected to the new intrinsic.
static bool replaceWithApproxNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
  // Pick the NVVM counterpart first; anything without one is left alone.
  Intrinsic::ID ApproxID = Intrinsic::not_intrinsic;
  switch (BuiltinCall.getIntrinsicID()) {
  case Intrinsic::fpbuiltin_fdiv:
    ApproxID = Intrinsic::nvvm_div_approx_f;
    break;
  case Intrinsic::fpbuiltin_sqrt:
    ApproxID = Intrinsic::nvvm_sqrt_approx_f;
    break;
  default:
    return false;
  }

  // isFloatTy() is false for every vector type and for every scalar other
  // than float, which is exactly the set of cases that must fall back.
  auto *RetTy = BuiltinCall.getType();
  if (!RetTy->isFloatTy())
    return replaceWithLLVMIR(BuiltinCall);

  // Emit the approximate intrinsic right before the original call and
  // redirect every user to it.
  IRBuilder<> Builder(&BuiltinCall);
  SmallVector<Value *> CallArgs(BuiltinCall.args());
  Value *ApproxCall = Builder.CreateIntrinsic(RetTy, ApproxID, CallArgs);
  BuiltinCall.replaceAllUsesWith(ApproxCall);
  // Carry the fast-math flags of the original call over to the new one.
  cast<Instruction>(ApproxCall)->copyFastMathFlags(&BuiltinCall);
  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
                    << BuiltinCall.getCalledFunction()->getName()
                    << "` with equivalent IR. \n `");
  return true;
}

static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
const TargetTransformInfo &TTI,
FPBuiltinIntrinsic &BuiltinCall) {
Expand Down Expand Up @@ -154,6 +191,14 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
}
}

// We don't have an implementation for CUDA approximate-precision builtins,
// so map them onto NVPTX intrinsics. If no appropriate intrinsic is known,
// fall through to replaceWithAltMathFunction.
if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
if (replaceWithApproxNVPTXCalls(BuiltinCall))
return true;
}

/// Call TLI to select a function implementation to call
StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall);
if (ImplName.empty()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I put this test in CodeGen because Andy was adding his fpbuiltin tests there (including those that only test transformations without invoking llc).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer those tests to be in Transform instead, but I'm not against adding a single new test into CodeGen simply because they should either all be in the right location, or none of them (i.e. the move should be done as a separate PR).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally agree, let's keep the tests in one place for now. If necessary, we will move some of them to the Transform dir.


target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; fdiv on float with attribute #0 applied: the scalar call is expected to
; become a call to llvm.nvvm.div.approx.f, while the <2 x float> call is
; expected to become a plain fdiv instruction.
; CHECK-LABEL: @test_fdiv
; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}}
define void @test_fdiv(float %d1, <2 x float> %v2d1,
float %d2, <2 x float> %v2d2) {
entry:
%t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
%t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
ret void
}

; Same as the plain float fdiv case, but the calls carry the `fast` flag:
; the flag is expected to appear on both the llvm.nvvm.div.approx.f call
; (scalar) and the fdiv instruction (<2 x float>).
; CHECK-LABEL: @test_fdiv_fast
; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}}
define void @test_fdiv_fast(float %d1, <2 x float> %v2d1,
float %d2, <2 x float> %v2d2) {
entry:
%t0 = call fast float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
%t1 = call fast <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
ret void
}

declare float @llvm.fpbuiltin.fdiv.f32(float, float)
declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>)

; fdiv on double with attribute #0 applied: both the scalar and the
; <2 x double> call are expected to become plain fdiv instructions rather
; than an NVVM approximate intrinsic.
; CHECK-LABEL: @test_fdiv_double
; CHECK: %{{.*}} = fdiv double %{{.*}}, %{{.*}}
; CHECK: %{{.*}} = fdiv <2 x double> %{{.*}}, %{{.*}}
define void @test_fdiv_double(double %d1, <2 x double> %v2d1,
double %d2, <2 x double> %v2d2) {
entry:
%t0 = call double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
%t1 = call <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
ret void
}

declare double @llvm.fpbuiltin.fdiv.f64(double, double)
declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)

; sqrt on float with attribute #0 applied: the scalar call is expected to
; become a call to llvm.nvvm.sqrt.approx.f, while the <2 x float> call is
; expected to become a call to llvm.sqrt.v2f32.
; NOTE(review): the <4 x float> %v4d parameter is not used in the body.
; CHECK-LABEL: @test_sqrt
; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}})
; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) {
entry:
%t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #0
%t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
ret void
}

declare float @llvm.fpbuiltin.sqrt.f32(float)
declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)

; sqrt on double with attribute #0 applied: the scalar call is expected to
; become a call to llvm.sqrt.f64 and the <2 x double> call a call to
; llvm.sqrt.v2f64, rather than an NVVM approximate intrinsic.
; CHECK-LABEL: @test_sqrt_double
; CHECK: %{{.*}} = call double @llvm.sqrt.f64(double %{{.*}})
; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
define void @test_sqrt_double(double %d, <2 x double> %v2d) {
entry:
%t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
%t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
ret void
}

declare double @llvm.fpbuiltin.sqrt.f64(double)
declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)

attributes #0 = { "fpbuiltin-max-error"="3.0" }
Loading