[LLVM] Custom intrinsics for half-precision ops on x86

njroussel · njroussel · commit 5c0d17633b97 · 2026-02-13T11:00:32.000+01:00
diff --git a/src/llvm_eval.h b/src/llvm_eval.h
@@ -16,36 +16,36 @@
 
 #define def_fma_vec_f16_intrinsic()                                            \
     fmt_intrinsic(                                                             \
-        "define internal fastcc <$w x half> @fma.v$wf16(<$w x half> %a, <$w x half> %b, <$w x half> %c) #0 ${\n" \
+        "define internal <$w x half> @fma.v$wf16(<$w x half> %a, <$w x half> %b, <$w x half> %c) local_unnamed_addr #0 ${\n" \
         "    %a_f32  = fpext <$w x half> %a to <$w x float>\n"                 \
         "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"                 \
         "    %c_f32  = fpext <$w x half> %c to <$w x float>\n"                 \
         "    %out_f32 = call fast <$w x float> @llvm.fma.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32, <$w x float> %c_f32)\n" \
         "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n"            \
         "    ret <$w x half> %out\n"                                           \
         "$}"                                                                   \
-    );
+    ) /* IR template for @fma.v$wf16: fpext the three half operands to float, call @llvm.fma.v$wf32, fptrunc the result back to half. Declared with local_unnamed_addr like the minnum/maxnum helpers. No trailing ';' in the macro (dropped by this change) - presumably supplied at the use site; confirm against callers. */
 
 #define def_minnum_vec_f16_intrinsic()                                         \
     fmt_intrinsic(                                                             \
-        "define internal fastcc <$w x half> @minnum.v$wf16(<$w x half> %a, <$w x half> %b) #0 ${\n" \
+        "define internal <$w x half> @minnum.v$wf16(<$w x half> %a, <$w x half> %b) local_unnamed_addr #0 ${\n" \
         "    %a_f32  = fpext <$w x half> %a to <$w x float>\n"                 \
         "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"                 \
         "    %out_f32 = call fast <$w x float> @llvm.minnum.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32)\n" \
         "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n"            \
         "    ret <$w x half> %out\n"                                           \
         "$}"                                                                   \
-    );
+    ) /* IR template for @minnum.v$wf16: fpext both half operands to float, call @llvm.minnum.v$wf32, fptrunc the result back to half. No trailing ';' in the macro (dropped by this change) - presumably supplied at the use site; confirm against callers. */
 
 #define def_maxnum_vec_f16_intrinsic()                                         \
     fmt_intrinsic(                                                             \
-        "define internal fastcc <$w x half> @maxnum.v$wf16(<$w x half> %a, <$w x half> %b) #0 ${\n" \
+        "define internal <$w x half> @maxnum.v$wf16(<$w x half> %a, <$w x half> %b) local_unnamed_addr #0 ${\n" \
         "    %a_f32  = fpext <$w x half> %a to <$w x float>\n"                 \
         "    %b_f32  = fpext <$w x half> %b to <$w x float>\n"                 \
         "    %out_f32 = call fast <$w x float> @llvm.maxnum.v$wf32(<$w x float> %a_f32, <$w x float> %b_f32)\n" \
         "    %out = fptrunc <$w x float> %out_f32 to <$w x half>\n"            \
         "    ret <$w x half> %out\n"                                           \
         "$}"                                                                   \
-    );
+    ) /* IR template for @maxnum.v$wf16: fpext both half operands to float, call @llvm.maxnum.v$wf32, fptrunc the result back to half. No trailing ';' in the macro (dropped by this change) - presumably supplied at the use site; confirm against callers. */
 
 #endif