[Arm64EC] Add support for half (#152843)

tgross35 · web-flow · commit 919021b0df8c · 2025-08-12T14:15:52.000-07:00
`f16` is passed and returned in vector registers on both x86 on AArch64, the same calling convention as `f32`, so it is a straightforward type to support. The calling convention support already exists, added as part of a6065f0 ("Arm64EC entry/exit thunks, consolidated. (#79067)"). Thus, add mangling and remove the error in order to make `half` work. MSVC does not yet support `_Float16`, so for now this will remain an LLVM-only extension. Fixes the `f16` portion of #94434
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -316,6 +316,12 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
                         ThunkArgTranslation::PointerIndirection};
   };
 
+  if (T->isHalfTy()) {
+    // Prefix with `llvm` since MSVC doesn't specify `_Float16`
+    Out << "__llvm_h__";
+    return direct(T);
+  }
+
   if (T->isFloatTy()) {
     Out << "f";
     return direct(T);
@@ -327,8 +333,8 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
   }
 
   if (T->isFloatingPointTy()) {
-    report_fatal_error(
-        "Only 32 and 64 bit floating points are supported for ARM64EC thunks");
+    report_fatal_error("Only 16, 32, and 64 bit floating points are supported "
+                       "for ARM64EC thunks");
   }
 
   auto &DL = M->getDataLayout();
@@ -342,8 +348,16 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
     uint64_t ElementCnt = T->getArrayNumElements();
     uint64_t ElementSizePerBytes = DL.getTypeSizeInBits(ElementTy) / 8;
     uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes;
-    if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) {
-      Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
+    if (ElementTy->isHalfTy() || ElementTy->isFloatTy() ||
+        ElementTy->isDoubleTy()) {
+      if (ElementTy->isHalfTy())
+        // Prefix with `llvm` since MSVC doesn't specify `_Float16`
+        Out << "__llvm_H__";
+      else if (ElementTy->isFloatTy())
+        Out << "F";
+      else if (ElementTy->isDoubleTy())
+        Out << "D";
+      Out << TotalSizeBytes;
       if (Alignment.value() >= 16 && !Ret)
         Out << "a" << Alignment.value();
       if (TotalSizeBytes <= 8) {
@@ -355,8 +369,9 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
         return pointerIndirection(T);
       }
     } else if (T->isFloatingPointTy()) {
-      report_fatal_error("Only 32 and 64 bit floating points are supported for "
-                         "ARM64EC thunks");
+      report_fatal_error(
+          "Only 16, 32, and 64 bit floating points are supported "
+          "for ARM64EC thunks");
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -85,10 +85,10 @@ define i64 @simple_integers(i8, i16, i32, i64) nounwind {
   ret i64 0
 }
 
-; NOTE: Only float and double are supported.
-define double @simple_floats(float, double) nounwind {
-; CHECK-LABEL:    .def    $ientry_thunk$cdecl$d$fd;
-; CHECK:          .section        .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$d$fd
+; NOTE: Only half, float, and double are supported.
+define double @simple_floats(half, float, double) nounwind {
+; CHECK-LABEL:    .def    $ientry_thunk$cdecl$d$__llvm_h__fd;
+; CHECK:          .section        .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$d$__llvm_h__fd
 ; CHECK:          // %bb.0:
 ; CHECK-NEXT:     stp     q6, q7, [sp, #-176]!            // 32-byte Folded Spill
 ; CHECK-NEXT:     .seh_save_any_reg_px    q6, 176
@@ -600,7 +600,7 @@ start:
 ; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$i8$i8i8i8i8
 ; CHECK-NEXT:     .word   1
 ; CHECK-NEXT:     .symidx "#simple_floats"
-; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$d$fd
+; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$d$__llvm_h__fd
 ; CHECK-NEXT:     .word   1
 ; CHECK-NEXT:     .symidx "#has_varargs"
 ; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$v$varargs
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
@@ -93,10 +93,10 @@ declare i64 @simple_integers(i8, i16, i32, i64) nounwind;
 ; CHECK-NEXT:     .seh_endfunclet
 ; CHECK-NEXT:     .seh_endproc
 
-; NOTE: Only float and double are supported.
-declare double @simple_floats(float, double) nounwind;
-; CHECK-LABEL:    .def    $iexit_thunk$cdecl$d$fd;
-; CHECK:          .section        .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$d$fd
+; NOTE: Only half, float, and double are supported.
+declare double @simple_floats(half, float, double) nounwind;
+; CHECK-LABEL:    .def    $iexit_thunk$cdecl$d$__llvm_h__fd;
+; CHECK:          .section        .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$d$__llvm_h__fd
 ; CHECK:          // %bb.0:
 ; CHECK-NEXT:     sub     sp, sp, #48
 ; CHECK-NEXT:     .seh_stackalloc 48
@@ -129,8 +129,8 @@ declare double @simple_floats(float, double) nounwind;
 ; CHECK-NEXT:     adrp    x11, simple_floats
 ; CHECK-NEXT:     add     x11, x11, :lo12:simple_floats
 ; CHECK-NEXT:     ldr     x8, [x8, :lo12:__os_arm64x_check_icall]
-; CHECK-NEXT:     adrp    x10, $iexit_thunk$cdecl$d$fd
-; CHECK-NEXT:     add     x10, x10, :lo12:$iexit_thunk$cdecl$d$fd
+; CHECK-NEXT:     adrp    x10, $iexit_thunk$cdecl$d$__llvm_h__fd
+; CHECK-NEXT:     add     x10, x10, :lo12:$iexit_thunk$cdecl$d$__llvm_h__fd
 ; CHECK-NEXT:     blr     x8
 ; CHECK-NEXT:     .seh_startepilogue
 ; CHECK-NEXT:     ldr     x30, [sp], #16                  // 8-byte Folded Reload
@@ -282,33 +282,36 @@ declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind;
 ; CHECK:          .seh_endfunclet
 ; CHECK:          .seh_endproc
 
-declare [2 x i8] @small_array([2 x i8], [2 x float]) nounwind;
-; CHECK-LABEL:    .def    $iexit_thunk$cdecl$m2$m2F8;
-; CHECK:          .section        .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m2$m2F8
+declare [2 x i8] @small_array([2 x i8], [2 x half], [2 x float]) nounwind;
+; CHECK-LABEL:    .def    $iexit_thunk$cdecl$m2$m2__llvm_H__4F8;
+; CHECK:          .section        .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m2$m2__llvm_H__4F8
 ; CHECK:          // %bb.0:
-; CHECK-NEXT:     sub     sp, sp, #64
-; CHECK-NEXT:     .seh_stackalloc 64
-; CHECK-NEXT:     stp     x29, x30, [sp, #48]             // 16-byte Folded Spill
-; CHECK-NEXT:     .seh_save_fplr  48
-; CHECK-NEXT:     add     x29, sp, #48
-; CHECK-NEXT:     .seh_add_fp     48
+; CHECK-NEXT:     sub     sp, sp, #80
+; CHECK-NEXT:     .seh_stackalloc 80
+; CHECK-NEXT:     stp     x29, x30, [sp, #64]             // 16-byte Folded Spill
+; CHECK-NEXT:     .seh_save_fplr  64
+; CHECK-NEXT:     add     x29, sp, #64
+; CHECK-NEXT:     .seh_add_fp     64
 ; CHECK-NEXT:     .seh_endprologue
-; CHECK-NEXT:     sturb   w1, [x29, #-1]
-; CHECK-NEXT:     adrp    x8, __os_arm64x_dispatch_call_no_redirect
-; CHECK-NEXT:     sturb   w0, [x29, #-2]
-; CHECK-NEXT:     ldr     x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
-; CHECK-NEXT:     stp     s0, s1, [x29, #-12]
-; CHECK-NEXT:     ldurh   w0, [x29, #-2]
-; CHECK-NEXT:     ldur    x1, [x29, #-12]
-; CHECK-NEXT:     blr     x16
-; CHECK-NEXT:     mov     w0, w8
-; CHECK-NEXT:     sturh   w8, [x29, #-14]
-; CHECK-NEXT:     ubfx    w1, w8, #8, #8
+; CHECK-NEXT:     sturb	w0, [x29, #-2]
+; CHECK-NEXT:     adrp	x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT:     sturb	w1, [x29, #-1]
+; CHECK-NEXT:     ldr	x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT:     stur	h0, [x29, #-6]
+; CHECK-NEXT:     ldurh	w0, [x29, #-2]
+; CHECK-NEXT:     stur	h1, [x29, #-4]
+; CHECK-NEXT:     stp	s2, s3, [x29, #-16]
+; CHECK-NEXT:     ldur	w1, [x29, #-6]
+; CHECK-NEXT:     ldur	x2, [x29, #-16]
+; CHECK-NEXT:     blr	x16
+; CHECK-NEXT:     mov	w0, w8
+; CHECK-NEXT:     sturh	w8, [x29, #-18]
+; CHECK-NEXT:     ubfx	w1, w8, #8, #8
 ; CHECK-NEXT:     .seh_startepilogue
-; CHECK-NEXT:     ldp     x29, x30, [sp, #48]             // 16-byte Folded Reload
-; CHECK-NEXT:     .seh_save_fplr  48
-; CHECK-NEXT:     add     sp, sp, #64
-; CHECK-NEXT:     .seh_stackalloc 64
+; CHECK-NEXT:     ldp	x29, x30, [sp, #64]             // 16-byte Folded Reload
+; CHECK-NEXT:     .seh_save_fplr	64
+; CHECK-NEXT:     add	sp, sp, #80
+; CHECK-NEXT:     .seh_stackalloc	80
 ; CHECK-NEXT:     .seh_endepilogue
 ; CHECK-NEXT:     ret
 ; CHECK-NEXT:     .seh_endfunclet
@@ -325,8 +328,8 @@ declare [2 x i8] @small_array([2 x i8], [2 x float]) nounwind;
 ; CHECK-NEXT:     adrp    x11, small_array
 ; CHECK-NEXT:     add     x11, x11, :lo12:small_array
 ; CHECK-NEXT:     ldr     x8, [x8, :lo12:__os_arm64x_check_icall]
-; CHECK-NEXT:     adrp    x10, $iexit_thunk$cdecl$m2$m2F8
-; CHECK-NEXT:     add     x10, x10, :lo12:$iexit_thunk$cdecl$m2$m2F8
+; CHECK-NEXT:     adrp    x10, $iexit_thunk$cdecl$m2$m2__llvm_H__4F8
+; CHECK-NEXT:     add     x10, x10, :lo12:$iexit_thunk$cdecl$m2$m2__llvm_H__4F8
 ; CHECK-NEXT:     blr     x8
 ; CHECK-NEXT:     .seh_startepilogue
 ; CHECK-NEXT:     ldr     x30, [sp], #16                  // 8-byte Folded Reload
@@ -577,7 +580,7 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind;
 ; CHECK-NEXT:     .symidx simple_integers
 ; CHECK-NEXT:     .word   0
 ; CHECK-NEXT:     .symidx simple_floats
-; CHECK-NEXT:     .symidx $iexit_thunk$cdecl$d$fd
+; CHECK-NEXT:     .symidx $iexit_thunk$cdecl$d$__llvm_h__fd
 ; CHECK-NEXT:     .word   4
 ; CHECK-NEXT:     .symidx "#simple_floats$exit_thunk"
 ; CHECK-NEXT:     .symidx simple_floats
@@ -601,7 +604,7 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind;
 ; CHECK-NEXT:     .symidx has_aligned_sret
 ; CHECK-NEXT:     .word   0
 ; CHECK-NEXT:     .symidx small_array
-; CHECK-NEXT:     .symidx $iexit_thunk$cdecl$m2$m2F8
+; CHECK-NEXT:     .symidx $iexit_thunk$cdecl$m2$m2__llvm_H__4F8
 ; CHECK-NEXT:     .word   4
 ; CHECK-NEXT:     .symidx "#small_array$exit_thunk"
 ; CHECK-NEXT:     .symidx small_array
@@ -634,14 +637,14 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind;
 define void @func_caller() nounwind {
   call void @no_op()
   call i64 @simple_integers(i8 0, i16 0, i32 0, i64 0)
-  call double @simple_floats(float 0.0, double 0.0)
+  call double @simple_floats(half 0.0, float 0.0, double 0.0)
   call void (...) @has_varargs()
   %c = alloca i8
   call void @has_sret(ptr sret([100 x i8]) %c)
   %aligned = alloca %TSRet, align 32
   store %TSRet { i64 0, i64 0 }, ptr %aligned, align 32
   call void @has_aligned_sret(ptr align 32 sret(%TSRet) %aligned)
-  call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x float] [float 0.0, float 0.0])
+  call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x half] [half 0.0, half 0.0], [2 x float] [float 0.0, float 0.0])
   call [3 x i64] @large_array([3 x i64] [i64 0, i64 0, i64 0], [2 x double] [double 0.0, double 0.0], [2 x [2 x i64]] [[2 x i64] [i64 0, i64 0], [2 x i64] [i64 0, i64 0]])
   call %T2 @simple_struct(%T1 { i16 0 }, %T2 { i32 0, float 0.0 }, %T3 { i64 0, double 0.0 }, %T4 { i64 0, double 0.0, i8 0 })
   call <4 x i8> @small_vector(<4 x i8> <i8 0, i8 0, i8 0, i8 0>)
diff --git a/llvm/test/CodeGen/AArch64/frexp-arm64ec.ll b/llvm/test/CodeGen/AArch64/frexp-arm64ec.ll
@@ -2,6 +2,15 @@
 
 ; Separate from llvm-frexp.ll test because this errors on half cases
 
+; ARM64EC-LABEL: test_frexp_f16_i32
+; ARM64EC: fcvt d0, h0
+; ARM64EC: bl "#frexp"
+; ARM64EC: fcvt h0, d0
+define { half, i32 } @test_frexp_f16_i32(half %a) {
+  %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
+  ret { half, i32 } %result
+}
+
 ; ARM64EC-LABEL: test_frexp_f32_i32
 ; ARM64EC: fcvt d0, s0
 ; ARM64EC: bl "#frexp"
diff --git a/llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll b/llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll
@@ -3,6 +3,15 @@
 
 ; Separate from ldexp.ll test because this errors on half cases
 
+; ARM64EC-LABEL: ldexp_f16 =
+; ARM64EC: fcvt d0, h0
+; ARM64EC: bl "#ldexp"
+; ARM64EC: fcvt h0, d0
+define half @ldexp_f16(half %val, i32 %a) {
+  %call = call half @llvm.ldexp.f16(half %val, i32 %a)
+  ret half %call
+}
+
 ; ARM64EC-LABEL: ldexp_f32 =
 ; ARM64EC: fcvt d0, s0
 ; ARM64EC: bl "#ldexp"
diff --git a/llvm/test/CodeGen/AArch64/powi-arm64ec.ll b/llvm/test/CodeGen/AArch64/powi-arm64ec.ll
@@ -1,8 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=arm64ec-windows-msvc < %s | FileCheck -check-prefix=ARM64EC %s
 
-declare double @llvm.powi.f64.i32(double, i32)
+declare half @llvm.powi.f16.i32(half, i32)
 declare float @llvm.powi.f32.i32(float, i32)
+declare double @llvm.powi.f64.i32(double, i32)
+
+; ARM64EC-LABEL: powi_f16
+; ARM64EC: fcvt  s0, h0
+; ARM64EC: scvtf s1, w0
+; ARM64EC: bl "#powf"
+define half @powi_f16(half %x, i32 %n) nounwind {
+  %ret = tail call half @llvm.powi.f16.i32(half %x, i32 %n)
+  ret half %ret
+}
 
 ; ARM64EC-LABEL: powi_f32
 ; ARM64EC: scvtf s1, w0
diff --git a/llvm/test/CodeGen/Generic/half.ll b/llvm/test/CodeGen/Generic/half.ll
@@ -7,8 +7,7 @@
 ; RUN: %if aarch64-registered-target     %{ llc %s -o - -mtriple=aarch64-apple-darwin            | FileCheck %s --check-prefixes=ALL,CHECK %}
 ; RUN: %if aarch64-registered-target     %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc         | FileCheck %s --check-prefixes=ALL,CHECK %}
 ; RUN: %if aarch64-registered-target     %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK %}
-; FIXME(#94434) unsupported on arm64ec
-; RUN: %if aarch64-registered-target     %{ ! llc %s -o - -mtriple=arm64ec-pc-windows-msvc -filetype=null %}
+; RUN: %if aarch64-registered-target     %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc         | FileCheck %s --check-prefixes=ALL,CHECK %}
 ; RUN: %if amdgpu-registered-target      %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa               | FileCheck %s --check-prefixes=ALL,CHECK %}
 ; RUN: %if arc-registered-target         %{ llc %s -o - -mtriple=arc-elf                         | FileCheck %s --check-prefixes=ALL,CHECK %}
 ; RUN: %if arm-registered-target         %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi       | FileCheck %s --check-prefixes=ALL,CHECK %}
@@ -47,6 +46,8 @@
 ; RUN: %if xcore-registered-target       %{ llc %s -o - -mtriple=xcore-unknown-unknown           | FileCheck %s --check-prefixes=ALL,CHECK %}
 ; RUN: %if xtensa-registered-target      %{ llc %s -o - -mtriple=xtensa-none-elf                 | FileCheck %s --check-prefixes=ALL,CHECK %}
 
+; Note that arm64ec labels are quoted, hence the `{{"?}}:`.
+
 ; Codegen tests don't work the same for graphics targets. Add a dummy directive
 ; for filecheck, just make sure we don't crash.
 ; NOCRASH: {{.*}}
@@ -58,7 +59,7 @@
 ; Regression test for https://github.com/llvm/llvm-project/issues/97981.
 
 define half @from_bits(i16 %bits) nounwind {
-; ALL-LABEL: from_bits:
+; ALL-LABEL: from_bits{{"?}}:
 ; CHECK-NOT: __extend
 ; CHECK-NOT: __trunc
 ; CHECK-NOT: __gnu
@@ -68,7 +69,7 @@ define half @from_bits(i16 %bits) nounwind {
 }
 
 define i16 @to_bits(half %f) nounwind {
-; ALL-LABEL: to_bits:
+; ALL-LABEL: to_bits{{"?}}:
 ; CHECK-NOT: __extend
 ; CHECK-NOT: __trunc
 ; CHECK-NOT: __gnu
@@ -81,7 +82,7 @@ define i16 @to_bits(half %f) nounwind {
 ; https://github.com/llvm/llvm-project/issues/117337 and similar issues.
 
 define half @check_freeze(half %f) nounwind {
-; ALL-LABEL: check_freeze:
+; ALL-LABEL: check_freeze{{"?}}:
   %t0 = freeze half %f
   ret half %t0
 }