Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/c_backend/pipeline_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Pipeline : public Halide::Generator<Pipeline> {
Var x, y;

Func f, h;
f(x, y) = (input(clamp(x + 2, 0, input.dim(0).extent() - 1), clamp(y - 2, 0, input.dim(1).extent() - 1)) * 17) / 13;
f(x, y) = (input(clamp(x + 2, 0, input.dim(0).extent() - 1), clamp(y - 2, 0, input.dim(1).extent() - 1)) * 17) / 13 + cast<uint16_t>(x % 3.4f + fma(cast<float>(y), 0.5f, 1.2f));
h.define_extern("an_extern_stage", {f}, Int(16), 0, NameMangling::C);
output(x, y) = cast<uint16_t>(max(0, f(y, x) + f(x, y) + an_extern_func(x, y) + h()));

Expand Down
1 change: 1 addition & 0 deletions python_bindings/src/halide/halide_/PyIROperator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ void define_operators(py::module &m) {
m.def("log", &log);
m.def("pow", &pow);
m.def("erf", &erf);
m.def("fma", &fma);
m.def("fast_sin", &fast_sin);
m.def("fast_cos", &fast_cos);
m.def("fast_log", &fast_log);
Expand Down
25 changes: 23 additions & 2 deletions src/CodeGen_C.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,12 @@ void CodeGen_C::visit(const Mod *op) {
string arg0 = print_expr(op->a);
string arg1 = print_expr(op->b);
ostringstream rhs;
rhs << "fmod(" << arg0 << ", " << arg1 << ")";
if (op->type.is_scalar()) {
rhs << "::halide_cpp_fmod(";
} else {
rhs << print_type(op->type) << "_ops::fmod(";
}
rhs << arg0 << ", " << arg1 << ")";
print_assignment(op->type, rhs.str());
} else {
visit_binop(op->type, op->a, op->b, "%");
Expand Down Expand Up @@ -1845,8 +1850,24 @@ void CodeGen_C::visit(const Call *op) {
<< " + " << print_expr(base_offset) << "), /*rw*/0, /*locality*/0), 0)";
} else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
rhs << "(sizeof(halide_buffer_t))";
} else if (op->is_intrinsic(Call::strict_fma)) {
internal_assert(op->args.size() == 3)
<< "Wrong number of args for strict_fma: " << op->args.size();
if (op->type.is_scalar()) {
rhs << "::halide_cpp_fma("
<< print_expr(op->args[0]) << ", "
<< print_expr(op->args[1]) << ", "
<< print_expr(op->args[2]) << ")";
} else {
rhs << print_type(op->type) << "_ops::fma("
<< print_expr(op->args[0]) << ", "
<< print_expr(op->args[1]) << ", "
<< print_expr(op->args[2]) << ")";
}
} else if (op->is_strict_float_intrinsic()) {
// This depends on the generated C++ being compiled without -ffast-math
// This depends on the generated C++ being compiled without
// -ffast-math. Note that this would not be correct for strict_fma, so
// we handle it separately above.
Expr equiv = unstrictify_float(op);
rhs << print_expr(equiv);
} else if (op->is_intrinsic()) {
Expand Down
31 changes: 30 additions & 1 deletion src/CodeGen_C_prologue.template.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
/* MACHINE GENERATED By Halide. */

#if !(__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
#error "This code requires C++11 (or later); please upgrade your compiler."
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

#include <assert.h>
#include <fenv.h>
#include <float.h>
Expand Down Expand Up @@ -257,6 +260,32 @@ inline T halide_cpp_min(const T &a, const T &b) {
return (a < b) ? a : b;
}

// Scalar fused multiply-add: computes a * b + c with a single rounding.
// Backs the C backend's lowering of the strict_fma intrinsic.
template<typename T>
inline T halide_cpp_fma(const T &a, const T &b, const T &c) {
#if __has_builtin(__builtin_fma) && __has_builtin(__builtin_fmaf)
    if (sizeof(T) == sizeof(float)) {
        // Use the float builtin directly. __builtin_fma takes doubles, so
        // routing float arguments through it would round twice (once when the
        // double fma result is produced, again when narrowing back to float),
        // which can differ from a single rounding to float in tie cases.
        return __builtin_fmaf(a, b, c);
    } else {
        return (T)__builtin_fma((double)a, (double)b, (double)c);
    }
#else
    if (sizeof(T) == sizeof(float)) {
        return fmaf(a, b, c);
    } else {
        // NOTE(review): for types narrower than double (e.g. a 16-bit float
        // type) this computes the fma in double and then narrows, which can
        // double-round. Correct for float and double; confirm before relying
        // on it for narrower types.
        return (T)fma((double)a, (double)b, (double)c);
    }
#endif
}

// Scalar floating-point remainder (C fmod semantics: result has the sign of
// a, |result| < |b|). Backs the C backend's lowering of float Mod nodes.
// Note fmod is exact -- a - trunc(a/b)*b is always representable in T -- so
// computing in a wider type cannot change the result; we still use the
// float-specific functions for the float case, consistent with
// halide_cpp_fma, to avoid a pointless float->double->float round trip.
template<typename T>
inline T halide_cpp_fmod(const T &a, const T &b) {
#if __has_builtin(__builtin_fmod) && __has_builtin(__builtin_fmodf)
    if (sizeof(T) == sizeof(float)) {
        return __builtin_fmodf(a, b);
    } else {
        return (T)__builtin_fmod((double)a, (double)b);
    }
#else
    if (sizeof(T) == sizeof(float)) {
        return fmodf(a, b);
    } else {
        return (T)fmod((double)a, (double)b);
    }
#endif
}

// Accepts a value of any type and does nothing with it. Presumably emitted
// around otherwise-unused values in generated code to silence
// unused-variable warnings -- TODO confirm against the C backend's emission
// sites (not visible in this chunk).
template<typename T>
inline void halide_maybe_unused(const T &) {
}
Expand Down
36 changes: 32 additions & 4 deletions src/CodeGen_C_vectors.template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@
#define __has_attribute(x) 0
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

namespace {

// We can't use std::array because that has its own overload of operator<, etc,
Expand Down Expand Up @@ -150,6 +146,22 @@ class CppVectorOps {
return r;
}

// Lanewise fused multiply-add: each output lane is the scalar
// halide_cpp_fma of the corresponding lanes of a, b, and c.
static Vec fma(const Vec &a, const Vec &b, const Vec &c) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; ++lane) {
        result[lane] = ::halide_cpp_fma(a[lane], b[lane], c[lane]);
    }
    return result;
}

// Lanewise floating-point remainder: each output lane is the scalar
// halide_cpp_fmod of the corresponding lanes of a and b.
static Vec fmod(const Vec &a, const Vec &b) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; ++lane) {
        result[lane] = ::halide_cpp_fmod(a[lane], b[lane]);
    }
    return result;
}

static Mask logical_or(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
Expand Down Expand Up @@ -734,6 +746,22 @@ class NativeVectorOps {
#endif
}

// Lanewise fused multiply-add over native vectors: applies the scalar
// halide_cpp_fma to each lane of a, b, and c.
static Vec fma(const Vec a, const Vec b, const Vec c) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; ++lane) {
        result[lane] = ::halide_cpp_fma(a[lane], b[lane], c[lane]);
    }
    return result;
}

// Lanewise floating-point remainder over native vectors: applies the scalar
// halide_cpp_fmod to each lane of a and b.
static Vec fmod(const Vec a, const Vec b) {
    Vec result;
    for (size_t lane = 0; lane < Lanes; ++lane) {
        result[lane] = ::halide_cpp_fmod(a[lane], b[lane]);
    }
    return result;
}

// The relational operators produce signed-int of same width as input; our codegen expects uint8.
static Mask logical_or(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
Expand Down
4 changes: 4 additions & 0 deletions src/CodeGen_D3D12Compute_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
void CodeGen_D3D12Compute_Dev::init_module() {
debug(2) << "D3D12Compute device codegen init_module\n";

// TODO: we could support strict float intrinsics with the precise qualifier
internal_assert(!any_strict_float)
<< "strict float intrinsics not yet supported in d3d12compute backend";

// wipe the internal kernel source
src_stream.str("");
src_stream.clear();
Expand Down
14 changes: 14 additions & 0 deletions src/CodeGen_GPU_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,20 @@ void CodeGen_GPU_C::visit(const Call *op) {
equiv.accept(this);
}
}
} else if (op->is_intrinsic(Call::strict_fma)) {
// All shader languages have fma
Expr equiv = Call::make(op->type, "fma", op->args, Call::PureExtern);
equiv.accept(this);
} else {
CodeGen_C::visit(op);
}
}

void CodeGen_GPU_C::visit(const Mod *op) {
if (op->type.is_float()) {
// All shader languages have fmod
Expr equiv = Call::make(op->type, "fmod", {op->a, op->b}, Call::PureExtern);
equiv.accept(this);
} else {
CodeGen_C::visit(op);
}
Expand Down
10 changes: 10 additions & 0 deletions src/CodeGen_GPU_Dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@ struct CodeGen_GPU_Dev {
Device = 1, // Device/global memory fence
Shared = 2 // Threadgroup/shared memory fence
};

/** Some GPU APIs need to know what floating point mode we're in at kernel
* emission time, to emit appropriate pragmas. */
bool any_strict_float = false;

public:
void set_any_strict_float(bool any_strict_float) {
this->any_strict_float = any_strict_float;
}
};

/** A base class for GPU backends that require C-like shader output.
Expand All @@ -99,6 +108,7 @@ class CodeGen_GPU_C : public CodeGen_C {
using CodeGen_C::visit;
void visit(const Shuffle *op) override;
void visit(const Call *op) override;
void visit(const Mod *op) override;

std::string print_extern_call(const Call *op) override;

Expand Down
40 changes: 26 additions & 14 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3320,10 +3320,16 @@ void CodeGen_LLVM::visit(const Call *op) {
}
}

Expr call = Call::make(op->type, op->name, new_args, op->call_type);
{
ScopedValue<bool> old_in_strict_float(in_strict_float, true);
value = codegen(unstrictify_float(call.as<Call>()));
if (op->is_intrinsic(Call::strict_fma)) {
std::string name = "llvm.fma" + mangle_llvm_type(llvm_type_of(op->type));
value = call_intrin(op->type, op->type.lanes(), name, new_args);
} else {
// Lower to something other than a call node
Expr call = Call::make(op->type, op->name, new_args, op->call_type);
value = codegen(unstrictify_float(call.as<Call>()));
}
}

for (size_t i = 0; i < op->args.size(); i++) {
Expand Down Expand Up @@ -4729,23 +4735,29 @@ Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes,
Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes,
const string &name, vector<Value *> arg_values,
bool scalable_vector_result, bool is_reduction) {
auto fix_vector_lanes_of_type = [&](const llvm::Type *t) {
if (intrin_lanes == 1 || is_reduction) {
return t->getScalarType();
} else {
if (scalable_vector_result && effective_vscale != 0) {
return get_vector_type(result_type->getScalarType(),
intrin_lanes / effective_vscale, VectorTypeConstraint::VScale);
} else {
return get_vector_type(result_type->getScalarType(),
intrin_lanes, VectorTypeConstraint::Fixed);
}
}
};

llvm::Function *fn = module->getFunction(name);
if (!fn) {
vector<llvm::Type *> arg_types(arg_values.size());
for (size_t i = 0; i < arg_values.size(); i++) {
arg_types[i] = arg_values[i]->getType();
llvm::Type *t = arg_values[i]->getType();
arg_types[i] = fix_vector_lanes_of_type(t);
}

llvm::Type *intrinsic_result_type = result_type->getScalarType();
if (intrin_lanes > 1 && !is_reduction) {
if (scalable_vector_result && effective_vscale != 0) {
intrinsic_result_type = get_vector_type(result_type->getScalarType(),
intrin_lanes / effective_vscale, VectorTypeConstraint::VScale);
} else {
intrinsic_result_type = get_vector_type(result_type->getScalarType(),
intrin_lanes, VectorTypeConstraint::Fixed);
}
}
llvm::Type *intrinsic_result_type = fix_vector_lanes_of_type(result_type);
FunctionType *func_t = FunctionType::get(intrinsic_result_type, arg_types, false);
fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
fn->setCallingConv(CallingConv::C);
Expand Down Expand Up @@ -4780,7 +4792,7 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes
if (arg_i_lanes >= arg_lanes) {
// Horizontally reducing intrinsics may have
// arguments that have more lanes than the
// result. Assume that the horizontally reduce
// result. Assume that they horizontally reduce
// neighboring elements...
int reduce = arg_i_lanes / arg_lanes;
args.push_back(slice_vector(arg_values[i], start * reduce, intrin_lanes * reduce));
Expand Down
1 change: 1 addition & 0 deletions src/CodeGen_Metal_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,7 @@ void CodeGen_Metal_Dev::init_module() {

// Write out the Halide math functions.
src_stream << "#pragma clang diagnostic ignored \"-Wunused-function\"\n"
<< "#pragma METAL fp math_mode(" << (any_strict_float ? "safe)\n" : "fast)\n")
<< "#include <metal_stdlib>\n"
<< "using namespace metal;\n" // Seems like the right way to go.
<< "namespace {\n"
Expand Down
2 changes: 1 addition & 1 deletion src/CodeGen_OpenCL_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ void CodeGen_OpenCL_Dev::init_module() {
// This identifies the program as OpenCL C (as opposed to SPIR).
src_stream << "/*OpenCL C " << target.to_string() << "*/\n";

src_stream << "#pragma OPENCL FP_CONTRACT ON\n";
src_stream << "#pragma OPENCL FP_CONTRACT " << (any_strict_float ? "OFF\n" : "ON\n");

// Write out the Halide math functions.
src_stream << "inline float float_from_bits(unsigned int x) {return as_float(x);}\n"
Expand Down
25 changes: 20 additions & 5 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,12 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
}

void CodeGen_PTX_Dev::init_module() {
// This class uses multiple inheritance. It's a GPU device code generator,
// and also an llvm-based one. Both of these track strict_float presence,
// but OffloadGPULoops only sets the GPU device code generator flag, so here
// we set the CodeGen_LLVM flag to match.
CodeGen_LLVM::any_strict_float = CodeGen_GPU_Dev::any_strict_float;

init_context();

module = get_initial_module_for_ptx_device(target, context);
Expand Down Expand Up @@ -249,6 +255,15 @@ void CodeGen_PTX_Dev::init_module() {
function_does_not_access_memory(fn);
fn->addFnAttr(llvm::Attribute::NoUnwind);
}

if (CodeGen_GPU_Dev::any_strict_float) {
debug(0) << "Setting strict fp math\n";
set_strict_fp_math();
in_strict_float = target.has_feature(Target::StrictFloat);
} else {
debug(0) << "Setting fast fp math\n";
set_fast_fp_math();
}
}

void CodeGen_PTX_Dev::visit(const Call *op) {
Expand Down Expand Up @@ -611,13 +626,13 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";

TargetOptions options;
options.AllowFPOpFusion = FPOpFusion::Fast;
options.AllowFPOpFusion = CodeGen_GPU_Dev::any_strict_float ? llvm::FPOpFusion::Strict : llvm::FPOpFusion::Fast;
#if LLVM_VERSION < 210
options.UnsafeFPMath = true;
options.UnsafeFPMath = !CodeGen_GPU_Dev::any_strict_float;
#endif
options.NoInfsFPMath = true;
options.NoNaNsFPMath = true;
options.HonorSignDependentRoundingFPMathOption = false;
options.NoInfsFPMath = !CodeGen_GPU_Dev::any_strict_float;
options.NoNaNsFPMath = !CodeGen_GPU_Dev::any_strict_float;
options.HonorSignDependentRoundingFPMathOption = !CodeGen_GPU_Dev::any_strict_float;
options.NoZerosInBSS = false;
options.GuaranteedTailCallOpt = false;

Expand Down
12 changes: 9 additions & 3 deletions src/CodeGen_Vulkan_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev {
{"fast_pow_f32", GLSLstd450Pow},
{"floor_f16", GLSLstd450Floor},
{"floor_f32", GLSLstd450Floor},
{"fma", GLSLstd450Fma},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seeing this, sitting between the others, makes me wonder: should we have fma_f32, fma_f64, fma_f16 functions instead of simply "fma"? Halide uses these suffixes pretty much everywhere.

Edit: hmm, it seems that this is not the case for the strict_float intrinsics. We didn't do strict_add_f32. And I think that's fine.

{"log_f16", GLSLstd450Log},
{"log_f32", GLSLstd450Log},
{"sin_f16", GLSLstd450Sin},
Expand Down Expand Up @@ -1190,9 +1191,14 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Call *op) {
e.accept(this);
}
} else if (op->is_strict_float_intrinsic()) {
// TODO: Enable/Disable RelaxedPrecision flags?
Expr e = unstrictify_float(op);
e.accept(this);
if (op->is_intrinsic(Call::strict_fma)) {
Expr builtin_call = Call::make(op->type, "fma", op->args, Call::PureExtern);
builtin_call.accept(this);
} else {
// TODO: Enable/Disable RelaxedPrecision flags?
Expr e = unstrictify_float(op);
e.accept(this);
}
} else if (op->is_intrinsic(Call::IntrinsicOp::sorted_avg)) {
internal_assert(op->args.size() == 2);
// b > a, so the following works without widening:
Expand Down
1 change: 1 addition & 0 deletions src/IR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,7 @@ const char *const intrinsic_op_names[] = {
"strict_add",
"strict_div",
"strict_eq",
"strict_fma",
"strict_le",
"strict_lt",
"strict_max",
Expand Down
Loading
Loading