[LLVM] Add generic half precision intrinsic wrappers

njroussel · njroussel · commit 604eab1db37c · 2026-02-13T11:39:15.000+01:00
diff --git a/src/llvm_coop_vec.cpp b/src/llvm_coop_vec.cpp
@@ -227,10 +227,7 @@ void jitc_llvm_render_coop_vec(const Variable *v, const Variable *a0,
                         bool custom_intrinsic = false;
 #if !defined(__aarch64__)
                         if ((VarType) a0->type == VarType::Float16) {
-                            if ((JitOp) v->literal == JitOp::Min)
-                                def_minnum_vec_f16_intrinsic();
-                            else
-                                def_maxnum_vec_f16_intrinsic();
+                            def_f16_wrapper_binary_intrinsic(op);
                             custom_intrinsic = true;
                         }
 #endif
@@ -254,7 +251,7 @@ void jitc_llvm_render_coop_vec(const Variable *v, const Variable *a0,
                 bool custom_intrinsic = false;
 #if !defined(__aarch64__)
                 if ((VarType) a0->type == VarType::Float16) {
-                    def_fma_vec_f16_intrinsic();
+                    def_f16_wrapper_ternary_intrinsic("fma");
                     custom_intrinsic = true;
                 }
 #endif
@@ -452,7 +449,7 @@ void jitc_llvm_render_coop_vec(const Variable *v, const Variable *a0,
                 bool custom_intrinsic = false;
 #if !defined(__aarch64__)
                 if ((VarType) v->type == VarType::Float16) {
-                    def_fma_vec_f16_intrinsic();
+                    def_f16_wrapper_ternary_intrinsic("fma");
                     custom_intrinsic = true;
                 }
 #endif
diff --git a/src/llvm_eval.h b/src/llvm_eval.h
@@ -13,39 +13,30 @@
     } while (0)
 
 #if !defined(__aarch64__)
+#define def_f16_wrapper_binary_intrinsic(op_str) /* Passes fmt_intrinsic() the LLVM IR text of a half-precision wrapper around a two-operand float vector intrinsic. op_str supplies the name for the $s slots and is expanded three times, so it should be free of side effects. */ \
+    do { /* do/while(0) keeps the two calls together as one statement */       \
+        fmt_intrinsic("declare <$w x float> @llvm.$s.v$wf32(<$w x float>, <$w x float>)", op_str); /* declaration of the float intrinsic that the wrapper calls */ \
+        fmt_intrinsic( /* wrapper definition; $w is presumably the vector width -- confirm in fmt_intrinsic */ \
+            "define internal <$w x half> @$s.v$wf16(<$w x half> %a, <$w x half> %b) #0 ${\n" \
+            "    %a_f32  = fpext <$w x half> %a to <$w x float>\n" /* widen both operands to float */ \
+            "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"             \
+            "    %out_f32 = call fast <$w x float> @llvm.$s.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32)\n" \
+            "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n" /* narrow the result back to half */ \
+            "    ret <$w x half> %out\n"                                       \
+            "$}", op_str, op_str); /* two names: one per $s in the define text above */ \
+    } while (0)
 
-#define def_fma_vec_f16_intrinsic()                                            \
-    fmt_intrinsic(                                                             \
-        "define internal <$w x half> @fma.v$wf16(<$w x half> %a, <$w x half> %b, <$w x half> %c) #0 ${\n" \
-        "    %a_f32  = fpext <$w x half> %a to <$w x float>\n"                 \
-        "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"                 \
-        "    %c_f32  = fpext <$w x half> %c to <$w x float>\n"                 \
-        "    %out_f32 = call fast <$w x float> @llvm.fma.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32, <$w x float> %c_f32)\n" \
-        "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n"            \
-        "    ret <$w x half> %out\n"                                           \
-        "$}"                                                                   \
-    )
-
-#define def_minnum_vec_f16_intrinsic()                                         \
-    fmt_intrinsic(                                                             \
-        "define internal <$w x half> @minnum.v$wf16(<$w x half> %a, <$w x half> %b) local_unnamed_addr #0 ${\n" \
-        "    %a_f32  = fpext <$w x half> %a to <$w x float>\n"                 \
-        "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"                 \
-        "    %out_f32 = call fast <$w x float> @llvm.minnum.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32)\n" \
-        "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n"            \
-        "    ret <$w x half> %out\n"                                           \
-        "$}"                                                                   \
-    )
-
-#define def_maxnum_vec_f16_intrinsic()                                         \
-    fmt_intrinsic(                                                             \
-        "define internal <$w x half> @maxnum.v$wf16(<$w x half> %a, <$w x half> %b) local_unnamed_addr #0 ${\n" \
-        "    %a_f32  = fpext <$w x half> %a to <$w x float>\n"                 \
-        "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"                 \
-        "    %out_f32 = call fast <$w x float> @llvm.maxnum.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32)\n" \
-        "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n"            \
-        "    ret <$w x half> %out\n"                                           \
-        "$}"                                                                   \
-    )
-
+#define def_f16_wrapper_ternary_intrinsic(op_str) /* Passes fmt_intrinsic() the LLVM IR text of a half-precision wrapper around a three-operand float vector intrinsic (e.g. "fma"). op_str supplies the name for the $s slots and is expanded three times, so it should be free of side effects. */ \
+    do { /* do/while(0) keeps the two calls together as one statement */       \
+        fmt_intrinsic("declare <$w x float> @llvm.$s.v$wf32(<$w x float>, <$w x float>, <$w x float>)", op_str); /* declaration of the float intrinsic that the wrapper calls */ \
+        fmt_intrinsic( /* wrapper definition; $w is presumably the vector width -- confirm in fmt_intrinsic */ \
+            "define internal <$w x half> @$s.v$wf16(<$w x half> %a, <$w x half> %b, <$w x half> %c) #0 ${\n" \
+            "    %a_f32  = fpext <$w x half> %a to <$w x float>\n" /* widen all three operands to float */ \
+            "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"             \
+            "    %c_f32  = fpext <$w x half> %c to <$w x float>\n"             \
+            "    %out_f32 = call fast <$w x float> @llvm.$s.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32, <$w x float> %c_f32)\n" \
+            "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n" /* narrow the result back to half */ \
+            "    ret <$w x half> %out\n"                                       \
+            "$}", op_str, op_str); /* two names: one per $s in the define text above */ \
+    } while (0)
 #endif