From af3bac5f0621a7fd85980eb9cc616a5c5a443bef Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Sat, 3 May 2025 19:24:19 +0200
Subject: [PATCH] [openmp] Add initial ARM64EC support

Treat ARM64EC as an AArch64 target for the most part. Since the vararg calling
convention differs, add a dedicated implementation based on the AArch64 variant.
The differences include:
- Only the first four parameters are passed via registers
- x4 contains a pointer to the first stack argument
- x5 contains the size of arguments passed via the stack
- The call is preceded by a call to __os_arm64x_check_icall, which optionally
  sets up an indirect call via an exit thunk if the target is x86_64 code
---
 .../runtime/cmake/LibompGetArchitecture.cmake |   4 +-
 openmp/runtime/src/kmp_platform.h             |   9 +-
 openmp/runtime/src/z_Linux_asm.S              | 103 ++++++++++++++++++
 3 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/openmp/runtime/cmake/LibompGetArchitecture.cmake b/openmp/runtime/cmake/LibompGetArchitecture.cmake
index 81aa700e3b6db..f58b8c6464fb8 100644
--- a/openmp/runtime/cmake/LibompGetArchitecture.cmake
+++ b/openmp/runtime/cmake/LibompGetArchitecture.cmake
@@ -17,6 +17,8 @@ function(libomp_get_architecture return_arch)
   set(detect_arch_src_txt "
     #if defined(__KNC__)
       #error ARCHITECTURE=mic
+    #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
+      #error ARCHITECTURE=aarch64
     #elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
       #error ARCHITECTURE=x86_64
     #elif defined(__i386) || defined(__i386__) || defined(__IA32__) || defined(_M_I86) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
@@ -37,8 +39,6 @@ function(libomp_get_architecture return_arch)
       #error ARCHITECTURE=arm
     #elif defined(__ARM64_ARCH_8_32__)
       #error ARCHITECTURE=aarch64_32
-    #elif defined(__aarch64__) || defined(_M_ARM64)
-      #error ARCHITECTURE=aarch64
     #elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
       #error ARCHITECTURE=ppc64le
     #elif defined(__powerpc64__)
diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h
index 80afba6c6af2e..d2b8aa96aa8b8 100644
--- a/openmp/runtime/src/kmp_platform.h
+++ b/openmp/runtime/src/kmp_platform.h
@@ -132,15 +132,16 @@
 #define KMP_ARCH_SPARC 0
 
 #if KMP_OS_WINDOWS
-#if defined(_M_AMD64) || defined(__x86_64)
-#undef KMP_ARCH_X86_64
-#define KMP_ARCH_X86_64 1
-#elif defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) ||       \
+    defined(_M_ARM64EC)
 #undef KMP_ARCH_AARCH64
 #define KMP_ARCH_AARCH64 1
 #elif defined(__arm__) || defined(_M_ARM)
 #undef KMP_ARCH_ARMV7
 #define KMP_ARCH_ARMV7 1
+#elif defined(_M_AMD64) || defined(__x86_64)
+#undef KMP_ARCH_X86_64
+#define KMP_ARCH_X86_64 1
 #else
 #undef KMP_ARCH_X86
 #define KMP_ARCH_X86 1
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index 607bfd8e3cb0f..6ae8d1cc1b3ab 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -159,6 +159,7 @@ KMP_PREFIX_UNDERSCORE($0):
     .globl KMP_PREFIX_UNDERSCORE(\proc)
 KMP_PREFIX_UNDERSCORE(\proc):
 .endm
+
 # else // KMP_OS_DARWIN || KMP_OS_WINDOWS
 # define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols
 // Format labels so that they don't override function names in gdb's backtraces
@@ -1301,6 +1302,106 @@ KMP_LABEL(kmp_no_args):
 // '
 
 #if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
+#ifdef __arm64ec__
+
+//------------------------------------------------------------------------
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
+//                         int gtid, int tid,
+//                         int argc, void *p_argv[]) {
+//   (*pkfn)( & gtid, & tid, argv[0], ... );
+//   return 1;
+// }
+//
+// parameters:
+//    x0:    pkfn
+//    w1:    gtid
+//    w2:    tid
+//    w3:    argc
+//    x4:    p_argv
+//
+// locals:
+//    __gtid:    gtid parm pushed on stack so can pass &gtid to pkfn
+//    __tid:     tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+//    x8:    used as temporary for stack placement calculation
+//    w9:    used as temporary for number of pkfn parms
+//    x10:   used to traverse p_argv array
+//    x11:   used to hold pkfn address
+//    x12:   used as temporary for stack parameters
+//    x4/x5: outgoing: pointer to first stack argument / byte size of stack arguments
+//
+// return:    w0    (always 1/TRUE)
+//
+
+__gtid = 4
+__tid = 8
+
+// -- Begin  __kmp_invoke_microtask
+// mark_begin;
+    .section .text,"xr",discard,"#__kmp_invoke_microtask"
+    .globl "#__kmp_invoke_microtask"
+    ALIGN 2
+"#__kmp_invoke_microtask":
+    stp     x29, x30, [sp, #-16]!       // push frame pointer and return address
+    mov     x29, sp
+
+    mov     w9, #1
+    add     w9, w9, w3, lsr #1          // w9 = 1 + argc / 2
+    sub     sp, sp, w9, uxtw #4         // reserve w9 * 16 bytes below the frame
+    mov     x8, sp                      // x8 = write cursor for stack arguments
+
+    mov     x11, x0                     // x11 = pkfn
+    str     w1, [x29, #-__gtid]         // spill gtid so that &gtid can be passed
+    str     w2, [x29, #-__tid]          // spill tid so that &tid can be passed
+    mov     w9, w3                      // w9 = argc
+    mov     x10, x4                     // x10 = p_argv
+
+    sub     x0, x29, #__gtid            // argument 0: &gtid
+    sub     x1, x29, #__tid             // argument 1: &tid
+    mov     x4, sp                      // x4 = pointer to the first stack argument
+    mov     w5, #0                      // x5 = size of stack arguments, none so far
+
+    cbz     w9, KMP_LABEL(kmp_1)
+    ldr     x2, [x10]                   // argument 2: p_argv[0]
+
+    sub     w9, w9, #1
+    cbz     w9, KMP_LABEL(kmp_1)
+    ldr     x3, [x10, #8]!              // argument 3: p_argv[1]
+
+    sub     w5, w9, #1                  // w9 == argc - 1 at this point
+    lsl     w5, w5, #3                  // x5 = (argc - 2) * 8 bytes of stack arguments
+
+KMP_LABEL(kmp_0):
+    sub     w9, w9, #1                  // decrement before the test: copy exactly argc - 2 slots
+    cbz     w9, KMP_LABEL(kmp_1)
+    ldr     x12, [x10, #8]!             // next p_argv entry (pre-increment)
+    str     x12, [x8], #8               // append it to the stack argument area
+    b       KMP_LABEL(kmp_0)
+KMP_LABEL(kmp_1):
+    adrp    x10, $iexit_thunk$cdecl$v$varargs
+    add     x10, x10, :lo12:$iexit_thunk$cdecl$v$varargs    // x10 = exit thunk address
+    adrp    x8, __os_arm64x_check_icall
+    ldr     x8, [x8, :lo12:__os_arm64x_check_icall]         // load the checker's address
+    blr     x8                          // may set up a call via the exit thunk for x86_64 targets
+    blr     x11                         // call the target in x11 (pkfn on entry to the check)
+    mov     w0, #1                      // always return 1
+    mov     sp, x29                     // drop the argument area
+    ldp     x29, x30, [sp], #16
+    ret
+
+    .weak_anti_dep __kmp_invoke_microtask
+    .set __kmp_invoke_microtask, "#__kmp_invoke_microtask"
+
+    .section .hybmp$x,"yi"
+    .symidx "#__kmp_invoke_microtask"
+    .symidx $ientry_thunk$cdecl$i8$i8i8i8i8i8
+    .word 1
+// -- End  __kmp_invoke_microtask
+
+#else
+
 //------------------------------------------------------------------------
 // int
 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
@@ -1425,6 +1526,8 @@ KMP_LABEL(kmp_1):
     DEBUG_INFO __kmp_invoke_microtask
 // -- End  __kmp_invoke_microtask
 
+#endif
+
 #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */
 
 #if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM