33# Micro-kernels for building a performance-first mindset for 64-bit ARM (NEON).
44# ----------------------------------------------------------------------------
55
6- . section .text
7- . global i32_add_asm_kernel
6+ #ifdef __APPLE__
7+ #define SYMBOL_NAME(name) _##name // Add underscore on macOS
8+ #else
9+ #define SYMBOL_NAME(name) name // No underscore on GNU - based systems
10+ #endif
11+
12+ .text
13+ . global SYMBOL_NAME(i32_add_asm_kernel)
814
9- . global tops_f64_neon_asm_kernel
10- . global tops_f32_neon_asm_kernel
11- . global tops_f16_neon_asm_kernel
12- . global tops_bf16_neon_asm_kernel
13- . global tops_i8_neon_asm_kernel
14- . global tops_u8_neon_asm_kernel
15+ . global SYMBOL_NAME( tops_f64_neon_asm_kernel)
16+ . global SYMBOL_NAME( tops_f32_neon_asm_kernel)
17+ . global SYMBOL_NAME( tops_f16_neon_asm_kernel)
18+ . global SYMBOL_NAME( tops_bf16_neon_asm_kernel)
19+ . global SYMBOL_NAME( tops_i8_neon_asm_kernel)
20+ . global SYMBOL_NAME( tops_u8_neon_asm_kernel)
1521
1622# ----------------------------------------------------------------------------
1723# Simple function that adds two 32-bit integers.
1824# AArch64 ABI: W0 = 'a', W1 = 'b'. Return in W0.
1925# ----------------------------------------------------------------------------
20- i32_add_asm_kernel:
26+ SYMBOL_NAME( i32_add_asm_kernel) :
2127 add w0 , w0 , w1 // w0 = a + b; the sum is left in w0 as the return value
2228 ret // leaf routine: no calls and no stack use, so return straight to the caller
2329
@@ -26,7 +32,7 @@ i32_add_asm_kernel:
2632# Each FMLA vD.2d , vN.2d , vM.2d => 2 multiplies + 2 adds = 4 FLOPs.
2733# We'll do 10 instructions => 10 × 4 = 40 FLOPs total , returning 40 in W0.
2834# ----------------------------------------------------------------------------
29- tops_f64_neon_asm_kernel:
35+ SYMBOL_NAME( tops_f64_neon_asm_kernel) :
3036 fmla v0.2d , v1.2d , v2.2d
3137 fmla v3.2d , v4.2d , v5.2d
3238 fmla v6.2d , v7.2d , v8.2d
@@ -47,7 +53,7 @@ tops_f64_neon_asm_kernel:
4753# Let's do 10 instructions => 10 × 8 = 80 FLOPs total.
4854# Return 80 in W0.
4955# ----------------------------------------------------------------------------
50- tops_f32_neon_asm_kernel:
56+ SYMBOL_NAME( tops_f32_neon_asm_kernel) :
5157 fmla v0.4s , v1.4s , v2.4s
5258 fmla v3.4s , v4.4s , v5.4s
5359 fmla v6.4s , v7.4s , v8.4s
@@ -68,7 +74,7 @@ tops_f32_neon_asm_kernel:
6874# Each FMLA vD.8h, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs.
6975# We'll do 10 instructions => 160 FLOPs total, returning 160 in W0.
7076# ----------------------------------------------------------------------------
71- tops_f16_neon_asm_kernel:
77+ SYMBOL_NAME( tops_f16_neon_asm_kernel) :
7278 fmla v0. 8h , v1. 8h , v2. 8h
7379 fmla v3. 8h , v4. 8h , v5. 8h
7480 fmla v6. 8h , v7. 8h , v8. 8h
@@ -89,7 +95,7 @@ tops_f16_neon_asm_kernel:
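8995# bfmmla vD.4s, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs.
# NOTE(review): BFMMLA multiplies a 2x4 BF16 matrix by a 4x2 BF16 matrix and accumulates into a 2x2 FP32 tile, which works out to 16 multiplies + 16 adds = 32 FLOPs per instruction; confirm this count and the total returned in W0.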
9096# We'll do 10 instructions => 160 FLOPs total , returning 160 in W0.
9197# ----------------------------------------------------------------------------
92- tops_bf16_neon_asm_kernel:
98+ SYMBOL_NAME( tops_bf16_neon_asm_kernel) :
9399 bfmmla v0.4s , v1. 8h , v2. 8h
94100 bfmmla v3.4s , v4. 8h , v5. 8h
95101 bfmmla v6.4s , v7. 8h , v8. 8h
@@ -110,7 +116,7 @@ tops_bf16_neon_asm_kernel:
110116# sdot vD.4s , vN.16b , vM.16b => 16 multiplies + 16 adds = 32 FLOPs.
111117# We'll do 10 instructions => 320 FLOPs total , returning 320 in W0.
112118# ----------------------------------------------------------------------------
113- tops_i8_neon_asm_kernel:
119+ SYMBOL_NAME( tops_i8_neon_asm_kernel) :
114120 sdot v0.4s , v1.16b , v2.16b
115121 sdot v3.4s , v4.16b , v5.16b
116122 sdot v6.4s , v7.16b , v8.16b
@@ -131,7 +137,7 @@ tops_i8_neon_asm_kernel:
131137# udot vD.4s , vN.16b , vM.16b => 16 multiplies + 16 adds = 32 FLOPs.
132138# We'll do 10 instructions => 320 FLOPs total , returning 320 in W0.
133139# ----------------------------------------------------------------------------
134- tops_u8_neon_asm_kernel:
140+ SYMBOL_NAME( tops_u8_neon_asm_kernel) :
135141 udot v0.4s , v1.16b , v2.16b
136142 udot v3.4s , v4.16b , v5.16b
137143 udot v6.4s , v7.16b , v8.16b
@@ -148,5 +154,7 @@ tops_u8_neon_asm_kernel:
148154
149155# ----------------------------------------------------------------------------
150156# Tell the linker/assembler that we do NOT need an executable stack:
157+ #ifdef __linux__
151158 . section .note.GNU - stack , "" , @progbits
159+ #endif
152160# ----------------------------------------------------------------------------
0 commit comments