diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h
index 52b1543f3f9..3bc78e353bc 100644
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h
+++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h
@@ -1,37 +1,44 @@
 #ifndef _FHT_H_
 #define _FHT_H_
 
-#include
 #include
+#include
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-int fht_float(float *buf, int log_n);
-int fht_double(double *buf, int log_n);
-int fht_float_oop(float *in, float *out, int log_n);
-int fht_double_oop(double *in, double *out, int log_n);
-
+int fht_float(float* buf, int log_n);
+#ifndef __aarch64__
+int fht_double(double* buf, int log_n);
+#endif
+int fht_float_oop(float* in, float* out, int log_n);
+#ifndef __aarch64__
+int fht_double_oop(double* in, double* out, int log_n);
+#endif
 
 #ifdef __cplusplus
 } // extern "C"
 
-static inline int fht(float *buf, int log_n) {
-    return fht_float(buf, log_n);
+static inline int fht(float* buf, int log_n) {
+  return fht_float(buf, log_n);
 }
 
-static inline int fht(double *buf, int log_n) {
-    return fht_double(buf, log_n);
+#ifndef __aarch64__
+static inline int fht(double* buf, int log_n) {
+  return fht_double(buf, log_n);
 }
+#endif
 
-static inline int fht(float *buf, float *out, int log_n) {
-    return fht_float_oop(buf, out, log_n);
+static inline int fht(float* buf, float* out, int log_n) {
+  return fht_float_oop(buf, out, log_n);
 }
 
-static inline int fht(double *buf, double *out, int log_n) {
-    return fht_double_oop(buf, out, log_n);
+#ifndef __aarch64__
+static inline int fht(double* buf, double* out, int log_n) {
+  return fht_double_oop(buf, out, log_n);
 }
+#endif
 
 #endif
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h
index 3bc332d5672..13ec1086500 100644
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h
+++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h
@@ -7,6 +7,10 @@
 extern "C" {
 #endif
 
+#ifdef __aarch64__
+#include "fht_neon.c"
+#define VECTOR_WIDTH (16u)
+#else
 #ifdef __AVX__
 #include "fht_avx.c"
 #define VECTOR_WIDTH (32u)
@@ -14,16 +18,19 @@ extern "C" {
 #include "fht_sse.c"
 #define VECTOR_WIDTH (16u)
 #endif
+#endif
 
-int fht_float_oop(float *in, float *out, int log_n) {
-    fast_copy(out, in, sizeof(float) << log_n);
-    return fht_float(out, log_n);
+int fht_float_oop(float* in, float* out, int log_n) {
+  fast_copy(out, in, sizeof(float) << log_n);
+  return fht_float(out, log_n);
 }
 
-int fht_double_oop(double *in, double *out, int log_n) {
-    fast_copy(out, in, sizeof(double) << log_n);
-    return fht_double(out, log_n);
+#ifndef __aarch64__
+int fht_double_oop(double* in, double* out, int log_n) {
+  fast_copy(out, in, sizeof(double) << log_n);
+  return fht_double(out, log_n);
 }
+#endif
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c
new file mode 100644
index 00000000000..3d84ee96195
--- /dev/null
+++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c
@@ -0,0 +1,3019 @@
+// @generated
+#include "fht.h"
+static inline void helper_float_1(float* buf);
+static inline void helper_float_1(float* buf) {
+  for (int j = 0; j < 2; j += 2) {
+    for (int k = 0; k < 1; ++k) {
+      float u = buf[j + k];
+      float v = buf[j + k + 1];
+      buf[j + k] = u + v;
+      buf[j + k + 1] = u - v;
+    }
+  }
+}
+static inline void helper_float_2(float* buf);
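(Aside, not part of the patch: every generated helper_float_N in this new file computes the same in-place, unnormalized fast Walsh-Hadamard transform on a block of 2^N floats. helper_float_1 above is the two-element base case; the NEON blocks that follow unroll the identical butterfly four lanes at a time, with FADD/FSUB playing the role of u + v and u - v. A minimal scalar sketch of that recurrence, with fwht_reference as a hypothetical name:)

// Scalar reference (illustrative only): the transform that the generated
// helper_float_N routines implement for 2^log_n floats. Stage h pairs
// buf[k] with buf[k + h]; the asm's FADD/FSUB vectorize this butterfly.
#include <stddef.h>

static void fwht_reference(float* buf, int log_n) {
  size_t n = (size_t)1 << log_n;
  for (size_t h = 1; h < n; h <<= 1) {
    for (size_t j = 0; j < n; j += 2 * h) {
      for (size_t k = j; k < j + h; ++k) {
        float u = buf[k];
        float v = buf[k + h];
        buf[k] = u + v; // "FADD" lane
        buf[k + h] = u - v; // "FSUB" lane
      }
    }
  }
}

(The helper_float_N_recursive functions below are this same computation with the stage loop unrolled: recursive half- or quarter-size calls, then one cross-block FADD/FSUB pass at stride 2^(N-1); the TRN1/TRN2/FNEG and DUP/INS/FNEG sequences handle the two stages that fall inside a single 4-lane vector.)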
+static inline void helper_float_2(float* buf) { + for (int j = 0; j < 4; j += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "TRN1 v16.4S, v0.4S, v0.4S\n" + "FNEG v17.4S, v0.4S\n" + "TRN2 v17.4S, v0.4S, v17.4S\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v0.D[0]\n" + "FNEG v17.4S, v0.4S\n" + "INS v17.D[0], v0.D[1]\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "ST1 {v0.4S}, [%0]\n" ::"r"(buf + j) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } +} +void helper_float_3_recursive(float* buf, int depth); +void helper_float_3_recursive(float* buf, int depth) { + if (depth == 2) { + helper_float_2(buf); + return; + } + if (depth == 3) { + helper_float_3_recursive(buf + 0, 2); + helper_float_3_recursive(buf + 4, 2); + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_3(float* buf); +void helper_float_3(float* buf) { + helper_float_3_recursive(buf, 3); +} +void helper_float_4_recursive(float* buf, int depth); +void helper_float_4_recursive(float* buf, int depth) { + if (depth == 3) { + helper_float_3(buf); + return; + } + if (depth == 4) { + helper_float_4_recursive(buf + 0, 3); + helper_float_4_recursive(buf + 8, 3); + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 8; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_4(float* buf); +void helper_float_4(float* buf) { + helper_float_4_recursive(buf, 4); +} +void helper_float_5_recursive(float* buf, int depth); +void helper_float_5_recursive(float* buf, int depth) { + if (depth == 4) { + helper_float_4(buf); + return; + } + if (depth == 5) { + helper_float_5_recursive(buf + 0, 4); + helper_float_5_recursive(buf + 16, 4); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 16; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 16) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + 
"%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_5(float* buf); +void helper_float_5(float* buf) { + helper_float_5_recursive(buf, 5); +} +void helper_float_6_recursive(float* buf, int depth); +void helper_float_6_recursive(float* buf, int depth) { + if (depth == 3) { + helper_float_3(buf); + return; + } + if (depth == 6) { + helper_float_6_recursive(buf + 0, 3); + helper_float_6_recursive(buf + 8, 3); + helper_float_6_recursive(buf + 16, 3); + helper_float_6_recursive(buf + 24, 3); + helper_float_6_recursive(buf + 32, 3); + helper_float_6_recursive(buf + 40, 3); + helper_float_6_recursive(buf + 48, 3); + helper_float_6_recursive(buf + 56, 3); + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" + "ST1 {v18.4S}, [%2]\n" + "ST1 {v19.4S}, [%3]\n" + "ST1 {v20.4S}, [%4]\n" + "ST1 {v21.4S}, [%5]\n" + "ST1 {v22.4S}, [%6]\n" + "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8), + "r"(buf + j + k + 16), + "r"(buf + j + k + 24), + "r"(buf + j + k + 32), + "r"(buf + j + k + 40), + "r"(buf + j + k + 48), + "r"(buf + j + k + 56) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_6(float* buf); +void helper_float_6(float* buf) { + helper_float_6_recursive(buf, 6); +} +void helper_float_7_recursive(float* buf, int depth); +void helper_float_7_recursive(float* buf, int depth) { + if (depth == 3) { + helper_float_3(buf); + return; + } + if (depth == 7) { + helper_float_7_recursive(buf + 0, 3); + helper_float_7_recursive(buf + 8, 3); + helper_float_7_recursive(buf + 16, 3); + helper_float_7_recursive(buf + 24, 3); + helper_float_7_recursive(buf + 32, 3); + helper_float_7_recursive(buf + 40, 3); + helper_float_7_recursive(buf + 48, 3); + helper_float_7_recursive(buf + 56, 3); + helper_float_7_recursive(buf + 64, 3); + helper_float_7_recursive(buf + 72, 3); + helper_float_7_recursive(buf + 80, 3); + helper_float_7_recursive(buf + 88, 3); + helper_float_7_recursive(buf + 96, 3); + helper_float_7_recursive(buf + 104, 3); + helper_float_7_recursive(buf + 112, 3); + helper_float_7_recursive(buf + 120, 3); + for (int j = 
0; j < 128; j += 128) { + for (int k = 0; k < 8; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8), + "r"(buf + j + k + 16), + "r"(buf + j + k + 24), + "r"(buf + j + k + 32), + "r"(buf + j + k + 40), + "r"(buf + j + k + 48), + "r"(buf + j + k + 56), + "r"(buf + j + k + 64), + "r"(buf + j + k + 72), + "r"(buf + j + k + 80), + "r"(buf + j + k + 88), + "r"(buf + j + k + 96), + "r"(buf + j + k + 104), + "r"(buf + j + k + 112), + "r"(buf + j + k + 120) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + 
"%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_7(float* buf); +void helper_float_7(float* buf) { + helper_float_7_recursive(buf, 7); +} +static inline void helper_float_8(float* buf); +static inline void helper_float_8(float* buf) { + for (int j = 0; j < 256; j += 64) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "TRN1 v16.4S, v0.4S, v0.4S\n" + "FNEG v17.4S, v0.4S\n" + "TRN2 v17.4S, v0.4S, v17.4S\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v1.4S, v1.4S\n" + "FNEG v17.4S, v1.4S\n" + "TRN2 v17.4S, v1.4S, v17.4S\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v2.4S, v2.4S\n" + "FNEG v17.4S, v2.4S\n" + "TRN2 v17.4S, v2.4S, v17.4S\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v3.4S, v3.4S\n" + "FNEG v17.4S, v3.4S\n" + "TRN2 v17.4S, v3.4S, v17.4S\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v4.4S, v4.4S\n" + "FNEG v17.4S, v4.4S\n" + "TRN2 v17.4S, v4.4S, v17.4S\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v5.4S, v5.4S\n" + "FNEG v17.4S, v5.4S\n" + "TRN2 v17.4S, v5.4S, v17.4S\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v6.4S, v6.4S\n" + "FNEG v17.4S, v6.4S\n" + "TRN2 v17.4S, v6.4S, v17.4S\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v7.4S, v7.4S\n" + "FNEG v17.4S, v7.4S\n" + "TRN2 v17.4S, v7.4S, v17.4S\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v8.4S, v8.4S\n" + "FNEG v17.4S, v8.4S\n" + "TRN2 v17.4S, v8.4S, v17.4S\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v9.4S, v9.4S\n" + "FNEG v17.4S, v9.4S\n" + "TRN2 v17.4S, v9.4S, v17.4S\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v10.4S, v10.4S\n" + "FNEG v17.4S, v10.4S\n" + "TRN2 v17.4S, v10.4S, v17.4S\n" + "FADD v10.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v11.4S, v11.4S\n" + "FNEG v17.4S, v11.4S\n" + "TRN2 v17.4S, v11.4S, v17.4S\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v12.4S, v12.4S\n" + "FNEG v17.4S, v12.4S\n" + "TRN2 v17.4S, v12.4S, v17.4S\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v13.4S, v13.4S\n" + "FNEG v17.4S, v13.4S\n" + "TRN2 v17.4S, v13.4S, v17.4S\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v14.4S, v14.4S\n" + "FNEG v17.4S, v14.4S\n" + "TRN2 v17.4S, v14.4S, v17.4S\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v15.4S, v15.4S\n" + "FNEG v17.4S, v15.4S\n" + "TRN2 v17.4S, v15.4S, v17.4S\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v0.D[0]\n" + "FNEG v17.4S, v0.4S\n" + "INS v17.D[0], v0.D[1]\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v1.D[0]\n" + "FNEG v17.4S, v1.4S\n" + "INS v17.D[0], v1.D[1]\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v2.D[0]\n" + "FNEG v17.4S, v2.4S\n" + "INS v17.D[0], v2.D[1]\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v3.D[0]\n" + "FNEG v17.4S, v3.4S\n" + "INS v17.D[0], v3.D[1]\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v4.D[0]\n" + "FNEG v17.4S, v4.4S\n" + "INS v17.D[0], v4.D[1]\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v5.D[0]\n" + "FNEG v17.4S, v5.4S\n" + "INS v17.D[0], v5.D[1]\n" + "FADD v5.4S, v16.4S, v17.4S\n" + 
"DUP v16.2D, v6.D[0]\n" + "FNEG v17.4S, v6.4S\n" + "INS v17.D[0], v6.D[1]\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v7.D[0]\n" + "FNEG v17.4S, v7.4S\n" + "INS v17.D[0], v7.D[1]\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v8.D[0]\n" + "FNEG v17.4S, v8.4S\n" + "INS v17.D[0], v8.D[1]\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v9.D[0]\n" + "FNEG v17.4S, v9.4S\n" + "INS v17.D[0], v9.D[1]\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v10.D[0]\n" + "FNEG v17.4S, v10.4S\n" + "INS v17.D[0], v10.D[1]\n" + "FADD v10.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v11.D[0]\n" + "FNEG v17.4S, v11.4S\n" + "INS v17.D[0], v11.D[1]\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v12.D[0]\n" + "FNEG v17.4S, v12.4S\n" + "INS v17.D[0], v12.D[1]\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v13.D[0]\n" + "FNEG v17.4S, v13.4S\n" + "INS v17.D[0], v13.D[1]\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v14.D[0]\n" + "FNEG v17.4S, v14.4S\n" + "INS v17.D[0], v14.D[1]\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v15.D[0]\n" + "FNEG v17.4S, v15.4S\n" + "INS v17.D[0], v15.D[1]\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + 
"ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4), + "r"(buf + j + k + 8), + "r"(buf + j + k + 12), + "r"(buf + j + k + 16), + "r"(buf + j + k + 20), + "r"(buf + j + k + 24), + "r"(buf + j + k + 28), + "r"(buf + j + k + 32), + "r"(buf + j + k + 36), + "r"(buf + j + k + 40), + "r"(buf + j + k + 44), + "r"(buf + j + k + 48), + "r"(buf + j + k + 52), + "r"(buf + j + k + 56), + "r"(buf + j + k + 60) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 64; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 64), + "r"(buf + j + k + 128), + "r"(buf + j + k + 192) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } +} +void helper_float_9_recursive(float* buf, int depth); +void helper_float_9_recursive(float* buf, int depth) { + if (depth == 8) { + helper_float_8(buf); + return; + } + if (depth == 9) { + helper_float_9_recursive(buf + 0, 8); + helper_float_9_recursive(buf + 256, 8); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 256) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_9(float* buf); +void helper_float_9(float* buf) { + helper_float_9_recursive(buf, 9); +} +void helper_float_10_recursive(float* buf, int depth); +void helper_float_10_recursive(float* buf, int depth) { + if (depth == 8) { + helper_float_8(buf); + return; + } + if (depth == 10) { + helper_float_10_recursive(buf + 0, 8); + helper_float_10_recursive(buf + 256, 8); + helper_float_10_recursive(buf + 512, 8); + helper_float_10_recursive(buf + 768, 8); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, 
v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 256), + "r"(buf + j + k + 512), + "r"(buf + j + k + 768) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_10(float* buf); +void helper_float_10(float* buf) { + helper_float_10_recursive(buf, 10); +} +void helper_float_11_recursive(float* buf, int depth); +void helper_float_11_recursive(float* buf, int depth) { + if (depth == 10) { + helper_float_10(buf); + return; + } + if (depth == 11) { + helper_float_11_recursive(buf + 0, 10); + helper_float_11_recursive(buf + 1024, 10); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_11(float* buf); +void helper_float_11(float* buf) { + helper_float_11_recursive(buf, 11); +} +void helper_float_12_recursive(float* buf, int depth); +void helper_float_12_recursive(float* buf, int depth) { + if (depth == 10) { + helper_float_10(buf); + return; + } + if (depth == 12) { + helper_float_12_recursive(buf + 0, 10); + helper_float_12_recursive(buf + 1024, 10); + helper_float_12_recursive(buf + 2048, 10); + helper_float_12_recursive(buf + 3072, 10); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024), + "r"(buf + j + k + 2048), + "r"(buf + j + k + 3072) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_12(float* buf); +void helper_float_12(float* buf) { + helper_float_12_recursive(buf, 12); +} +static inline void helper_float_13(float* buf); +static inline void helper_float_13(float* buf) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + 
"LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "TRN1 v16.4S, v0.4S, v0.4S\n" + "FNEG v17.4S, v0.4S\n" + "TRN2 v17.4S, v0.4S, v17.4S\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v1.4S, v1.4S\n" + "FNEG v17.4S, v1.4S\n" + "TRN2 v17.4S, v1.4S, v17.4S\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v2.4S, v2.4S\n" + "FNEG v17.4S, v2.4S\n" + "TRN2 v17.4S, v2.4S, v17.4S\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v3.4S, v3.4S\n" + "FNEG v17.4S, v3.4S\n" + "TRN2 v17.4S, v3.4S, v17.4S\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v4.4S, v4.4S\n" + "FNEG v17.4S, v4.4S\n" + "TRN2 v17.4S, v4.4S, v17.4S\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v5.4S, v5.4S\n" + "FNEG v17.4S, v5.4S\n" + "TRN2 v17.4S, v5.4S, v17.4S\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v6.4S, v6.4S\n" + "FNEG v17.4S, v6.4S\n" + "TRN2 v17.4S, v6.4S, v17.4S\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v7.4S, v7.4S\n" + "FNEG v17.4S, v7.4S\n" + "TRN2 v17.4S, v7.4S, v17.4S\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v8.4S, v8.4S\n" + "FNEG v17.4S, v8.4S\n" + "TRN2 v17.4S, v8.4S, v17.4S\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v9.4S, v9.4S\n" + "FNEG v17.4S, v9.4S\n" + "TRN2 v17.4S, v9.4S, v17.4S\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v10.4S, v10.4S\n" + "FNEG v17.4S, v10.4S\n" + "TRN2 v17.4S, v10.4S, v17.4S\n" + "FADD v10.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v11.4S, v11.4S\n" + "FNEG v17.4S, v11.4S\n" + "TRN2 v17.4S, v11.4S, v17.4S\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v12.4S, v12.4S\n" + "FNEG v17.4S, v12.4S\n" + "TRN2 v17.4S, v12.4S, v17.4S\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v13.4S, v13.4S\n" + "FNEG v17.4S, v13.4S\n" + "TRN2 v17.4S, v13.4S, v17.4S\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v14.4S, v14.4S\n" + "FNEG v17.4S, v14.4S\n" + "TRN2 v17.4S, v14.4S, v17.4S\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v15.4S, v15.4S\n" + "FNEG v17.4S, v15.4S\n" + "TRN2 v17.4S, v15.4S, v17.4S\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v0.D[0]\n" + "FNEG v17.4S, v0.4S\n" + "INS v17.D[0], v0.D[1]\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v1.D[0]\n" + "FNEG v17.4S, v1.4S\n" + "INS v17.D[0], v1.D[1]\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v2.D[0]\n" + "FNEG v17.4S, v2.4S\n" + "INS v17.D[0], v2.D[1]\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v3.D[0]\n" + "FNEG v17.4S, v3.4S\n" + "INS v17.D[0], v3.D[1]\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v4.D[0]\n" + "FNEG v17.4S, v4.4S\n" + "INS v17.D[0], v4.D[1]\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v5.D[0]\n" + "FNEG v17.4S, v5.4S\n" + "INS v17.D[0], v5.D[1]\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v6.D[0]\n" + "FNEG v17.4S, v6.4S\n" + "INS v17.D[0], v6.D[1]\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v7.D[0]\n" + "FNEG v17.4S, v7.4S\n" + "INS v17.D[0], v7.D[1]\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v8.D[0]\n" + "FNEG v17.4S, v8.4S\n" + "INS v17.D[0], v8.D[1]\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v9.D[0]\n" + "FNEG v17.4S, v9.4S\n" + "INS v17.D[0], v9.D[1]\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v10.D[0]\n" + "FNEG v17.4S, v10.4S\n" + "INS v17.D[0], v10.D[1]\n" + "FADD 
v10.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v11.D[0]\n" + "FNEG v17.4S, v11.4S\n" + "INS v17.D[0], v11.D[1]\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v12.D[0]\n" + "FNEG v17.4S, v12.4S\n" + "INS v17.D[0], v12.D[1]\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v13.D[0]\n" + "FNEG v17.4S, v13.4S\n" + "INS v17.D[0], v13.D[1]\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v14.D[0]\n" + "FNEG v17.4S, v14.4S\n" + "INS v17.D[0], v14.D[1]\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v15.D[0]\n" + "FNEG v17.4S, v15.4S\n" + "INS v17.D[0], v15.D[1]\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4), + "r"(buf + j + k + 8), + "r"(buf + j + k + 12), + "r"(buf + j + k + 16), + "r"(buf + j + k + 20), + "r"(buf + j + k + 24), + "r"(buf + j + k + 28), + "r"(buf + j + k + 32), + "r"(buf + j + k + 36), + "r"(buf + j + k + 40), + "r"(buf + j + k + 44), + "r"(buf + j + k + 48), + "r"(buf + j + k + 52), + "r"(buf + j + k + 56), + "r"(buf + j + k + 60) + : "%v0", + "%v1", + "%v2", + "%v3", + 
"%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + for (int j = 0; j < 8192; j += 1024) { + for (int k = 0; k < 64; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 64), + "r"(buf + j + k + 128), + "r"(buf + j + k + 192), + "r"(buf + j + k + 256), + "r"(buf + j + k + 320), + "r"(buf + j + k + 384), + "r"(buf + j + k + 448), + "r"(buf + j + k + 512), + "r"(buf + j + k + 576), 
+ "r"(buf + j + k + 640), + "r"(buf + j + k + 704), + "r"(buf + j + k + 768), + "r"(buf + j + k + 832), + "r"(buf + j + k + 896), + "r"(buf + j + k + 960) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" + "ST1 {v18.4S}, [%2]\n" + "ST1 {v19.4S}, [%3]\n" + "ST1 {v20.4S}, [%4]\n" + "ST1 {v21.4S}, [%5]\n" + "ST1 {v22.4S}, [%6]\n" + "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024), + "r"(buf + j + k + 2048), + "r"(buf + j + k + 3072), + "r"(buf + j + k + 4096), + "r"(buf + j + k + 5120), + "r"(buf + j + k + 6144), + "r"(buf + j + k + 7168) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } +} +void helper_float_14_recursive(float* buf, int depth); +void helper_float_14_recursive(float* buf, int depth) { + if (depth == 10) { + helper_float_10(buf); + return; + } + if (depth == 14) { + helper_float_14_recursive(buf + 0, 10); + helper_float_14_recursive(buf + 1024, 10); + helper_float_14_recursive(buf + 2048, 10); + helper_float_14_recursive(buf + 3072, 10); + helper_float_14_recursive(buf + 4096, 10); + helper_float_14_recursive(buf + 5120, 10); + helper_float_14_recursive(buf + 6144, 10); + helper_float_14_recursive(buf + 7168, 10); + helper_float_14_recursive(buf + 8192, 10); + helper_float_14_recursive(buf + 9216, 10); + helper_float_14_recursive(buf + 10240, 10); + helper_float_14_recursive(buf + 11264, 10); + helper_float_14_recursive(buf + 12288, 10); + helper_float_14_recursive(buf + 13312, 10); + helper_float_14_recursive(buf + 14336, 10); + helper_float_14_recursive(buf + 15360, 10); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, 
[%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024), + "r"(buf + j + k + 2048), + "r"(buf + j + k + 3072), + "r"(buf + j + k + 4096), + "r"(buf + j + k + 5120), + "r"(buf + j + k + 6144), + "r"(buf + j + k + 7168), + "r"(buf + j + k + 8192), + "r"(buf + j + k + 9216), + "r"(buf + j + k + 10240), + "r"(buf + j + k + 11264), + "r"(buf + j + k + 12288), + "r"(buf + j + k + 13312), + "r"(buf + j + k + 14336), + "r"(buf + j + k + 15360) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_14(float* buf); +void helper_float_14(float* buf) { + 
helper_float_14_recursive(buf, 14); +} +void helper_float_15_recursive(float* buf, int depth); +void helper_float_15_recursive(float* buf, int depth) { + if (depth == 13) { + helper_float_13(buf); + return; + } + if (depth == 15) { + helper_float_15_recursive(buf + 0, 13); + helper_float_15_recursive(buf + 8192, 13); + helper_float_15_recursive(buf + 16384, 13); + helper_float_15_recursive(buf + 24576, 13); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8192), + "r"(buf + j + k + 16384), + "r"(buf + j + k + 24576) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_15(float* buf); +void helper_float_15(float* buf) { + helper_float_15_recursive(buf, 15); +} +void helper_float_16_recursive(float* buf, int depth); +void helper_float_16_recursive(float* buf, int depth) { + if (depth == 15) { + helper_float_15(buf); + return; + } + if (depth == 16) { + helper_float_16_recursive(buf + 0, 15); + helper_float_16_recursive(buf + 32768, 15); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 32768) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_16(float* buf); +void helper_float_16(float* buf) { + helper_float_16_recursive(buf, 16); +} +void helper_float_17_recursive(float* buf, int depth); +void helper_float_17_recursive(float* buf, int depth) { + if (depth == 15) { + helper_float_15(buf); + return; + } + if (depth == 17) { + helper_float_17_recursive(buf + 0, 15); + helper_float_17_recursive(buf + 32768, 15); + helper_float_17_recursive(buf + 65536, 15); + helper_float_17_recursive(buf + 98304, 15); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + 
j + k + 32768), + "r"(buf + j + k + 65536), + "r"(buf + j + k + 98304) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_17(float* buf); +void helper_float_17(float* buf) { + helper_float_17_recursive(buf, 17); +} +void helper_float_18_recursive(float* buf, int depth); +void helper_float_18_recursive(float* buf, int depth) { + if (depth == 17) { + helper_float_17(buf); + return; + } + if (depth == 18) { + helper_float_18_recursive(buf + 0, 17); + helper_float_18_recursive(buf + 131072, 17); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 131072) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_18(float* buf); +void helper_float_18(float* buf) { + helper_float_18_recursive(buf, 18); +} +void helper_float_19_recursive(float* buf, int depth); +void helper_float_19_recursive(float* buf, int depth) { + if (depth == 18) { + helper_float_18(buf); + return; + } + if (depth == 19) { + helper_float_19_recursive(buf + 0, 18); + helper_float_19_recursive(buf + 262144, 18); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 262144) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_19(float* buf); +void helper_float_19(float* buf) { + helper_float_19_recursive(buf, 19); +} +void helper_float_20_recursive(float* buf, int depth); +void helper_float_20_recursive(float* buf, int depth) { + if (depth == 18) { + helper_float_18(buf); + return; + } + if (depth == 20) { + helper_float_20_recursive(buf + 0, 18); + helper_float_20_recursive(buf + 262144, 18); + helper_float_20_recursive(buf + 524288, 18); + helper_float_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 
{v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 262144), + "r"(buf + j + k + 524288), + "r"(buf + j + k + 786432) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_20(float* buf); +void helper_float_20(float* buf) { + helper_float_20_recursive(buf, 20); +} +void helper_float_21_recursive(float* buf, int depth); +void helper_float_21_recursive(float* buf, int depth) { + if (depth == 20) { + helper_float_20(buf); + return; + } + if (depth == 21) { + helper_float_21_recursive(buf + 0, 20); + helper_float_21_recursive(buf + 1048576, 20); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1048576) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_21(float* buf); +void helper_float_21(float* buf) { + helper_float_21_recursive(buf, 21); +} +void helper_float_22_recursive(float* buf, int depth); +void helper_float_22_recursive(float* buf, int depth) { + if (depth == 20) { + helper_float_20(buf); + return; + } + if (depth == 22) { + helper_float_22_recursive(buf + 0, 20); + helper_float_22_recursive(buf + 1048576, 20); + helper_float_22_recursive(buf + 2097152, 20); + helper_float_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1048576), + "r"(buf + j + k + 2097152), + "r"(buf + j + k + 3145728) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_22(float* buf); +void helper_float_22(float* buf) { + helper_float_22_recursive(buf, 22); +} +void helper_float_23_recursive(float* buf, int depth); +void helper_float_23_recursive(float* buf, int depth) { + if (depth == 22) { + helper_float_22(buf); + return; + } + if (depth == 23) { + helper_float_23_recursive(buf + 0, 22); + helper_float_23_recursive(buf + 4194304, 22); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 
4194304; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4194304) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_23(float* buf); +void helper_float_23(float* buf) { + helper_float_23_recursive(buf, 23); +} +void helper_float_24_recursive(float* buf, int depth); +void helper_float_24_recursive(float* buf, int depth) { + if (depth == 23) { + helper_float_23(buf); + return; + } + if (depth == 24) { + helper_float_24_recursive(buf + 0, 23); + helper_float_24_recursive(buf + 8388608, 23); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8388608) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_24(float* buf); +void helper_float_24(float* buf) { + helper_float_24_recursive(buf, 24); +} +void helper_float_25_recursive(float* buf, int depth); +void helper_float_25_recursive(float* buf, int depth) { + if (depth == 23) { + helper_float_23(buf); + return; + } + if (depth == 25) { + helper_float_25_recursive(buf + 0, 23); + helper_float_25_recursive(buf + 8388608, 23); + helper_float_25_recursive(buf + 16777216, 23); + helper_float_25_recursive(buf + 25165824, 23); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8388608), + "r"(buf + j + k + 16777216), + "r"(buf + j + k + 25165824) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_25(float* buf); +void helper_float_25(float* buf) { + helper_float_25_recursive(buf, 25); +} +void helper_float_26_recursive(float* buf, int depth); +void helper_float_26_recursive(float* buf, int depth) { + if (depth == 25) { + helper_float_25(buf); + return; + } + if (depth == 26) { + helper_float_26_recursive(buf + 0, 25); + 
helper_float_26_recursive(buf + 33554432, 25); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 33554432; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 33554432) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_26(float* buf); +void helper_float_26(float* buf) { + helper_float_26_recursive(buf, 26); +} +void helper_float_27_recursive(float* buf, int depth); +void helper_float_27_recursive(float* buf, int depth) { + if (depth == 26) { + helper_float_26(buf); + return; + } + if (depth == 27) { + helper_float_27_recursive(buf + 0, 26); + helper_float_27_recursive(buf + 67108864, 26); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 67108864) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_27(float* buf); +void helper_float_27(float* buf) { + helper_float_27_recursive(buf, 27); +} +void helper_float_28_recursive(float* buf, int depth); +void helper_float_28_recursive(float* buf, int depth) { + if (depth == 26) { + helper_float_26(buf); + return; + } + if (depth == 28) { + helper_float_28_recursive(buf + 0, 26); + helper_float_28_recursive(buf + 67108864, 26); + helper_float_28_recursive(buf + 134217728, 26); + helper_float_28_recursive(buf + 201326592, 26); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 67108864), + "r"(buf + j + k + 134217728), + "r"(buf + j + k + 201326592) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_28(float* buf); +void helper_float_28(float* buf) { + helper_float_28_recursive(buf, 28); +} +void helper_float_29_recursive(float* buf, int depth); +void helper_float_29_recursive(float* buf, int depth) { + 
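// FHT over 2^29 floats: recurse into two 2^28-element halves, then a single NEON FADD/FSUB butterfly pass at stride 2^28 combines them. +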
if (depth == 28) { + helper_float_28(buf); + return; + } + if (depth == 29) { + helper_float_29_recursive(buf + 0, 28); + helper_float_29_recursive(buf + 268435456, 28); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 268435456; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 268435456) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_29(float* buf); +void helper_float_29(float* buf) { + helper_float_29_recursive(buf, 29); +} +void helper_float_30_recursive(float* buf, int depth); +void helper_float_30_recursive(float* buf, int depth) { + if (depth == 29) { + helper_float_29(buf); + return; + } + if (depth == 30) { + helper_float_30_recursive(buf + 0, 29); + helper_float_30_recursive(buf + 536870912, 29); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 536870912; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 536870912) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_30(float* buf); +void helper_float_30(float* buf) { + helper_float_30_recursive(buf, 30); +} +int fht_float(float* buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_float_1(buf); + return 0; + } + if (log_n == 2) { + helper_float_2(buf); + return 0; + } + if (log_n == 3) { + helper_float_3(buf); + return 0; + } + if (log_n == 4) { + helper_float_4(buf); + return 0; + } + if (log_n == 5) { + helper_float_5(buf); + return 0; + } + if (log_n == 6) { + helper_float_6(buf); + return 0; + } + if (log_n == 7) { + helper_float_7(buf); + return 0; + } + if (log_n == 8) { + helper_float_8(buf); + return 0; + } + if (log_n == 9) { + helper_float_9(buf); + return 0; + } + if (log_n == 10) { + helper_float_10(buf); + return 0; + } + if (log_n == 11) { + helper_float_11(buf); + return 0; + } + if (log_n == 12) { + helper_float_12(buf); + return 0; + } + if (log_n == 13) { + helper_float_13(buf); + return 0; + } + if (log_n == 14) { + helper_float_14(buf); + return 0; + } + if (log_n == 15) { + helper_float_15(buf); + return 0; + } + if (log_n == 16) { + helper_float_16(buf); + return 0; + } + if (log_n == 17) { + helper_float_17(buf); + return 0; + } + if (log_n == 18) { + helper_float_18(buf); + return 0; + } + if (log_n == 19) { + helper_float_19(buf); + return 0; + } + if (log_n == 20) { + helper_float_20(buf); + return 0; + } + if (log_n == 21) { + helper_float_21(buf); + return 0; + } + if (log_n == 22) { + helper_float_22(buf); + return 0; + } + if (log_n == 23) { + helper_float_23(buf); + return 0; + } + if (log_n == 24) { + helper_float_24(buf); + 
return 0; + } + if (log_n == 25) { + helper_float_25(buf); + return 0; + } + if (log_n == 26) { + helper_float_26(buf); + return 0; + } + if (log_n == 27) { + helper_float_27(buf); + return 0; + } + if (log_n == 28) { + helper_float_28(buf); + return 0; + } + if (log_n == 29) { + helper_float_29(buf); + return 0; + } + if (log_n == 30) { + helper_float_30(buf); + return 0; + } + return 1; +} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py index b0d92368b3d..bf3655efda4 100644 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py @@ -1,7 +1,8 @@ +# Portions Copyright (c) Meta Platforms, Inc. and affiliates. import csv import os -import sys import subprocess +import sys max_log_n = 30 @@ -10,256 +11,386 @@ def is_distinct(l): return len(set(l)) == len(l) -def float_avx_0(register, aux_registers, ident=''): +def float_avx_0(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 4: - raise Exception('float_avx_0 needs at least four auxiliary registers') - res = ident + '"vpermilps $160, %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[0]) - res += ident + '"vpermilps $245, %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[1]) + raise Exception("float_avx_0 needs at least four auxiliary registers") + # given source ABCDEFGH, destination register gets AACCEEGG + res = ident + '"vpermilps $160, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + # given source ABCDEFGH, destination register gets BBDDFFHH + res += ident + '"vpermilps $245, %%%%%s, %%%%%s\\n"\n' % ( + register, + aux_registers[1], + ) + # aux2 <- 0 res += ident + '"vxorps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[2], aux_registers[2], aux_registers[2]) + aux_registers[2], + aux_registers[2], + aux_registers[2], + ) + # aux3 <- -B -B -D -D -F -F -H -H res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[1], aux_registers[2], aux_registers[3]) + aux_registers[1], + aux_registers[2], + aux_registers[3], + ) + # reg <- (A+B)(A-B)(C+D)(C-D)(E+F)(E-F)(G+H)(G-H) res += ident + '"vaddsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[3], aux_registers[0], register) + aux_registers[3], + aux_registers[0], + register, + ) return res -def float_avx_1(register, aux_registers, ident=''): +def float_avx_1(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 5: - raise Exception('float_avx_1 needs at least five auxiliary registers') - res = ident + '"vpermilps $68, %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[0]) - res += ident + '"vpermilps $238, %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[1]) + raise Exception("float_avx_1 needs at least five auxiliary registers") + # Given source ABCDEFGH, r0 <- ABABEFEF + res = ident + '"vpermilps $68, %%%%%s, %%%%%s\\n"\n' % 
(register, aux_registers[0]) + # Given source ABCDEFGH, r1 <- CDCDGHGH + res += ident + '"vpermilps $238, %%%%%s, %%%%%s\\n"\n' % ( + register, + aux_registers[1], + ) + # r2 <- 0 res += ident + '"vxorps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[2], aux_registers[2], aux_registers[2]) + aux_registers[2], + aux_registers[2], + aux_registers[2], + ) + # r3 <- -C -D -C -D -G -H -G -H res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[1], aux_registers[2], aux_registers[3]) + aux_registers[1], + aux_registers[2], + aux_registers[3], + ) + # r4 <- C D -C -D G H -G -H res += ident + '"vblendps $204, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[3], aux_registers[1], aux_registers[4]) + aux_registers[3], + aux_registers[1], + aux_registers[4], + ) + # reg <- (A + C) (B + D) (A - C) (B - D) etc. res += ident + '"vaddps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[0], aux_registers[4], register) + aux_registers[0], + aux_registers[4], + register, + ) return res -def float_avx_2(register, aux_registers, ident=''): +def float_avx_2(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 4: - raise Exception('float_avx_2 needs at least four auxiliary registers') + raise Exception("float_avx_2 needs at least four auxiliary registers") + # r0 <- 0 res = ident + '"vxorps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[0], aux_registers[0], aux_registers[0]) + aux_registers[0], + aux_registers[0], + aux_registers[0], + ) + # r1 <- -A -B -C -D -E -F -G -H res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - register, aux_registers[0], aux_registers[1]) + register, + aux_registers[0], + aux_registers[1], + ) + # r2 <- ABCDABCD res += ident + '"vperm2f128 $0, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - register, register, aux_registers[2]) + register, + register, + aux_registers[2], + ) + # r3 <- E F G H -E -F -G -H res += ident + '"vperm2f128 $49, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[1], register, aux_registers[3]) + aux_registers[1], + register, + aux_registers[3], + ) + # reg <- (A + E)(B + F)(C + G)(D + H)(A - E)(B - F)(C - G)(D - H)
res += ident + '"vaddps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[2], aux_registers[3], register) + aux_registers[2], + aux_registers[3], + register, + ) return res -def float_avx_3_etc(from_register_0, - from_register_1, - to_register_0, - to_register_1, - ident=''): +def float_avx_3_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): if not is_distinct( - [from_register_0, from_register_1, to_register_0, to_register_1]): - raise Exception('four registers must be distinct') + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") res = ident + '"vaddps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - from_register_1, from_register_0, to_register_0) + from_register_1, + from_register_0, + to_register_0, + ) res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - from_register_1, from_register_0, to_register_1) + from_register_1, + from_register_0, + to_register_1, + ) return res -def double_avx_0(register, aux_registers, ident=''): +def double_avx_0(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 4: - raise Exception('double_avx_0 needs at least four auxiliary registers') - res = ident + '"vpermilpd $0, %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[0]) - res += ident + '"vpermilpd $15, %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[1]) + raise Exception("double_avx_0 needs at least four auxiliary registers") + # r0 <- AACC + res = ident + '"vpermilpd $0, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + # r1 <- BBDD + res += ident + '"vpermilpd $15, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[1]) + # r2 <- 0 res += ident + '"vxorpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[2], aux_registers[2], aux_registers[2]) + aux_registers[2], + aux_registers[2], + aux_registers[2], + ) + # r3 <- -B -B -D -D res += ident + '"vsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[1], aux_registers[2], aux_registers[3]) + aux_registers[1], + aux_registers[2], + aux_registers[3], + ) + # reg <- (A + B)(A - B)(C + D)(C - D) res += ident + '"vaddsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[3], aux_registers[0], register) + aux_registers[3], + aux_registers[0], + register, + ) return res -def double_avx_1(register, aux_registers, ident=''): +def double_avx_1(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 4: - raise Exception('double_avx_1 needs at least four auxiliary registers') + raise Exception("double_avx_1 needs at least four auxiliary registers") + # r0 <- ABAB res = ident + '"vperm2f128 $0, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - register, register, aux_registers[0]) + register, + register, + aux_registers[0], + ) + # r1 <- 0 res += ident + '"vxorpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[1], aux_registers[1], aux_registers[1]) + aux_registers[1], + aux_registers[1], + 
aux_registers[1], + ) + # r2 <- -A -B -C -D res += ident + '"vsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - register, aux_registers[1], aux_registers[2]) + register, + aux_registers[1], + aux_registers[2], + ) + # r3 <- C D -C -D res += ident + '"vperm2f128 $49, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[2], register, aux_registers[3]) + aux_registers[2], + register, + aux_registers[3], + ) + # reg <- (A + C)(B + D)(A - C)(B - D) res += ident + '"vaddpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - aux_registers[3], aux_registers[0], register) + aux_registers[3], + aux_registers[0], + register, + ) return res -def double_avx_2_etc(from_register_0, - from_register_1, - to_register_0, - to_register_1, - ident=''): +def double_avx_2_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): if not is_distinct( - [from_register_0, from_register_1, to_register_0, to_register_1]): - raise Exception('four registers must be distinct') + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") res = ident + '"vaddpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - from_register_1, from_register_0, to_register_0) + from_register_1, + from_register_0, + to_register_0, + ) res += ident + '"vsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( - from_register_1, from_register_0, to_register_1) + from_register_1, + from_register_0, + to_register_1, + ) return res -def float_sse_0(register, aux_registers, ident=''): +def float_sse_0(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 2: - raise Exception('float_sse_0 needs at least two auxiliary registers') + raise Exception("float_sse_0 needs at least two auxiliary registers") res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) - res += ident + '"shufps $160, %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], - aux_registers[0]) + res += ident + '"shufps $160, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[0], + ) res += ident + '"shufps $245, %%%%%s, %%%%%s\\n"\n' % (register, register) - res += ident + '"xorps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - aux_registers[1]) + res += ident + '"xorps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[1]) res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[1]) - res += ident + '"addsubps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - aux_registers[0]) - res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], - register) + res += ident + '"addsubps %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[0], + ) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], register) return res -def float_sse_1(register, aux_registers, ident=''): +def float_sse_1(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 4: - raise Exception('float_sse_1 needs at least four auxiliary registers') + raise 
Exception("float_sse_1 needs at least four auxiliary registers") res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) - res += ident + '"shufps $68, %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], - aux_registers[0]) - res += ident + '"xorps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - aux_registers[1]) - res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[2]) - res += ident + '"shufps $14, %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - aux_registers[2]) - res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[3]) - res += ident + '"shufps $224, %%%%%s, %%%%%s\\n"\n' % (aux_registers[3], - aux_registers[1]) - res += ident + '"addps %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], - aux_registers[2]) - res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - aux_registers[2]) - res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (aux_registers[2], - register) + res += ident + '"shufps $68, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[0], + ) + res += ident + '"xorps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[1]) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[2]) + res += ident + '"shufps $14, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[2], + ) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[3]) + res += ident + '"shufps $224, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[3], + aux_registers[1], + ) + res += ident + '"addps %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], aux_registers[2]) + res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[2]) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (aux_registers[2], register) return res -def float_sse_2_etc(from_register_0, - from_register_1, - to_register_0, - to_register_1, - ident=''): +def float_sse_2_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): if not is_distinct( - [from_register_0, from_register_1, to_register_0, to_register_1]): - raise Exception('four registers must be distinct') - res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (from_register_0, - to_register_0) - res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (from_register_0, - to_register_1) - res += ident + '"addps %%%%%s, %%%%%s\\n"\n' % (from_register_1, - to_register_0) - res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (from_register_1, - to_register_1) + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_0) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_1) + res += ident + '"addps %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_0) + res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_1) return res -def double_sse_0(register, aux_registers, ident=''): +def double_sse_0(register, aux_registers, ident=""): if not is_distinct(aux_registers): - raise Exception('auxiliary registers must be distinct') + raise Exception("auxiliary registers must be distinct") if register in aux_registers: - raise Exception( - 'the main register can\'t be one of the auxiliary ones') + raise Exception("the main register can't be one of the auxiliary ones") if len(aux_registers) < 2: - raise Exception('double_sse_0 needs at least two auxiliary registers') + raise Exception("double_sse_0 needs at least two auxiliary registers") res = ident + '"movapd %%%%%s, %%%%%s\\n"\n' % 
(register, aux_registers[0]) - res += ident + '"haddpd %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], - aux_registers[0]) - res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (register, - aux_registers[1]) - res += ident + '"hsubpd %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - aux_registers[1]) - res += ident + '"blendpd $1, %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], - aux_registers[1]) - res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], - register) + res += ident + '"haddpd %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], aux_registers[0]) + res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[1]) + res += ident + '"hsubpd %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[1]) + res += ident + '"blendpd $1, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[1], + ) + res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], register) + return res + + +def double_sse_1_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): + if not is_distinct( + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_0) + res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_1) + res += ident + '"addpd %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_0) + res += ident + '"subpd %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_1) + return res + + +# Given reg = ABCD, return (A+B)(A-B)(C+D)(C-D) +def float_neon_0(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 2: + raise Exception("float_neon_0 needs at least two auxiliary registers") + # r0 <- AACC + res = f'{ident}"TRN1 {aux_registers[0]}.4S, {register}.4S, {register}.4S\\n"\n' + # r1 <- -A -B -C -D + res += f'{ident}"FNEG {aux_registers[1]}.4S, {register}.4S\\n"\n' + # r2 <- B (-B) D -D + res += f'{ident}"TRN2 {aux_registers[1]}.4S, {register}.4S, {aux_registers[1]}.4S\\n"\n' + # reg <- (A+B)(A-B)(C+D)(C-D) + res += f'{ident}"FADD {register}.4S, {aux_registers[0]}.4S, {aux_registers[1]}.4S\\n"\n' + + return res + + +# Given reg = ABCD, return (A + C)(B + D)(A - C)(B - D) +def float_neon_1(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 2: + raise Exception("float_neon_1 needs at least two auxiliary registers") + # r0 <- ABAB + res = f'{ident}"DUP {aux_registers[0]}.2D, {register}.D[0]\\n"\n' + # r1 <- -A -B -C -D + res += f'{ident}"FNEG {aux_registers[1]}.4S, {register}.4S\\n"\n' + # r1 <- C D -C -D + res += f'{ident}"INS {aux_registers[1]}.D[0], {register}.D[1]\\n"\n' + # reg <- (A + C)(B + D)(A - C)(B - D) + res += f'{ident}"FADD {register}.4S, {aux_registers[0]}.4S, {aux_registers[1]}.4S\\n"\n' + return res -def double_sse_1_etc(from_register_0, - from_register_1, - to_register_0, - to_register_1, - ident=''): +def float_neon_2_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): if not is_distinct( - [from_register_0, from_register_1, to_register_0, to_register_1]): - raise Exception('four registers must be distinct') - res = ident + '"movapd 
%%%%%s, %%%%%s\\n"\n' % (from_register_0, - to_register_0) - res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (from_register_0, - to_register_1) - res += ident + '"addpd %%%%%s, %%%%%s\\n"\n' % (from_register_1, - to_register_0) - res += ident + '"subpd %%%%%s, %%%%%s\\n"\n' % (from_register_1, - to_register_1) + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = f'{ident}"FADD {to_register_0}.4S, {from_register_0}.4S, {from_register_1}.4S\\n"\n' + res += f'{ident}"FSUB {to_register_1}.4S, {from_register_0}.4S, {from_register_1}.4S\\n"\n' return res -def plain_step(type_name, buf_name, log_n, it, ident=''): +def plain_step(type_name, buf_name, log_n, it, ident=""): if log_n <= 0: raise Exception("log_n must be positive") if it < 0: @@ -270,8 +401,7 @@ def plain_step(type_name, buf_name, log_n, it, ident=''): res = ident + "for (int j = 0; j < %d; j += %d) {\n" % (n, 1 << (it + 1)) res += ident + " for (int k = 0; k < %d; ++k) {\n" % (1 << it) res += ident + " %s u = %s[j + k];\n" % (type_name, buf_name) - res += ident + " %s v = %s[j + k + %d];\n" % (type_name, buf_name, - 1 << it) + res += ident + " %s v = %s[j + k + %d];\n" % (type_name, buf_name, 1 << it) res += ident + " %s[j + k] = u + v;\n" % buf_name res += ident + " %s[j + k + %d] = u - v;\n" % (buf_name, 1 << it) res += ident + " }\n" @@ -279,27 +409,34 @@ def plain_step(type_name, buf_name, log_n, it, ident=''): return res -def composite_step(buf_name, - log_n, - from_it, - to_it, - log_w, - registers, - move_instruction, - special_steps, - main_step, - ident=''): +MOVE_INSTRUCTION_USE_NEON = "NEON MOV" + + +def composite_step( + buf_name, + log_n, + from_it, + to_it, + log_w, + registers, + move_instruction, + special_steps, + main_step, + ident="", +): + # HACK: NEON needs different syntax for loads and stores. 
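+ # On x86 the mover is a mnemonic (movups/vmovups/movupd) spliced into "<mnemonic> (%0), %%reg" / "<mnemonic> %%reg, (%0)"; AArch64 writes loads and stores as "LD1 {vN.4S}, [%0]" / "ST1 {vN.4S}, [%0]", so a sentinel value stands in for the mnemonic and the emission below special-cases it.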
+ use_neon_movs = move_instruction == MOVE_INSTRUCTION_USE_NEON if log_n < log_w: - raise Exception('need at least %d elements' % (1 << log_w)) + raise Exception("need at least %d elements" % (1 << log_w)) num_registers = len(registers) if num_registers % 2 == 1: - raise Exception('odd number of registers: %d' % num_registers) + raise Exception("odd number of registers: %d" % num_registers) num_nontrivial_levels = 0 if to_it > log_w: first_nontrivial = max(from_it, log_w) num_nontrivial_levels = to_it - first_nontrivial if 1 << num_nontrivial_levels > num_registers / 2: - raise Exception('not enough registers') + raise Exception("not enough registers") n = 1 << log_n input_registers = [] output_registers = [] @@ -308,184 +445,285 @@ def composite_step(buf_name, input_registers.append(registers[i]) else: output_registers.append(registers[i]) - clobber = ', '.join(['"%%%s"' % x for x in registers]) + clobber = ", ".join(['"%%%s"' % x for x in registers]) if num_nontrivial_levels == 0: - res = ident + 'for (int j = 0; j < %d; j += %d) {\n' % (n, 1 << log_w) - res += ident + ' __asm__ volatile (\n' - res += ident + ' "%s (%%0), %%%%%s\\n"\n' % (move_instruction, - input_registers[0]) + res = ident + "for (int j = 0; j < %d; j += %d) {\n" % (n, 1 << log_w) + res += ident + " __asm__ volatile (\n" + if use_neon_movs: + res += f'{ident} "LD1 {{{input_registers[0]}.4S}}, [%0]\\n"\n' + else: + res += ident + ' "%s (%%0), %%%%%s\\n"\n' % ( + move_instruction, + input_registers[0], + ) for it in range(from_it, to_it): - res += special_steps[it](input_registers[0], output_registers, - ident + ' ') - res += ident + ' "%s %%%%%s, (%%0)\\n"\n' % (move_instruction, - input_registers[0]) - res += ident + ' :: "r"(%s + j) : %s, "memory"\n' % (buf_name, - clobber) - res += ident + ' );\n' - res += ident + '}\n' + res += special_steps[it]( + input_registers[0], output_registers, ident + " " + ) + if use_neon_movs: + res += f'{ident} "ST1 {{{input_registers[0]}.4S}}, [%0]\\n"\n' + else: + res += ident + ' "%s %%%%%s, (%%0)\\n"\n' % ( + move_instruction, + input_registers[0], + ) + res += ident + ' :: "r"(%s + j) : %s, "memory"\n' % (buf_name, clobber) + res += ident + " );\n" + res += ident + "}\n" return res - res = ident + 'for (int j = 0; j < %d; j += %d) {\n' % (n, 1 << to_it) - res += ident + ' for (int k = 0; k < %d; k += %d) {\n' % ( - 1 << (to_it - num_nontrivial_levels), 1 << log_w) + res = ident + "for (int j = 0; j < %d; j += %d) {\n" % (n, 1 << to_it) + res += ident + " for (int k = 0; k < %d; k += %d) {\n" % ( + 1 << (to_it - num_nontrivial_levels), + 1 << log_w, + ) subcube = [] for l in range(1 << num_nontrivial_levels): - subcube.append('j + k + ' + str(l * (1 << - (to_it - num_nontrivial_levels)))) - res += ident + ' __asm__ volatile (\n' + subcube.append("j + k + " + str(l * (1 << (to_it - num_nontrivial_levels)))) + res += ident + " __asm__ volatile (\n" for l in range(1 << num_nontrivial_levels): - res += ident + ' "%s (%%%d), %%%%%s\\n"\n' % (move_instruction, l, - input_registers[l]) + if use_neon_movs: + res += f'{ident} "LD1 {{{input_registers[l]}.4S}}, [%{l}]\\n"\n' + else: + res += ident + ' "%s (%%%d), %%%%%s\\n"\n' % ( + move_instruction, + l, + input_registers[l], + ) for it in range(from_it, log_w): for ii in range(1 << num_nontrivial_levels): - res += special_steps[it](input_registers[ii], output_registers, - ident + ' ') + res += special_steps[it]( + input_registers[ii], output_registers, ident + " " + ) for it in range(num_nontrivial_levels): for ii in range(0, 1 << 
num_nontrivial_levels, 1 << (it + 1)): for jj in range(1 << it): - res += main_step(input_registers[ii + jj], - input_registers[ii + jj + (1 << it)], - output_registers[ii + jj], - output_registers[ii + jj + - (1 << it)], ident + ' ') + res += main_step( + input_registers[ii + jj], + input_registers[ii + jj + (1 << it)], + output_registers[ii + jj], + output_registers[ii + jj + (1 << it)], + ident + " ", + ) tmp = input_registers input_registers = output_registers output_registers = tmp for l in range(1 << num_nontrivial_levels): - res += ident + ' "%s %%%%%s, (%%%d)\\n"\n' % ( - move_instruction, input_registers[l], l) + if use_neon_movs: + res += f'{ident} "ST1 {{{input_registers[l]}.4S}}, [%{l}]\\n"\n' + else: + res += ident + ' "%s %%%%%s, (%%%d)\\n"\n' % ( + move_instruction, + input_registers[l], + l, + ) res += ident + ' :: %s : %s, "memory"\n' % ( - ', '.join(['"r"(%s + %s)' % (buf_name, x) for x in subcube]), clobber) - res += ident + ' );\n' - res += ident + ' }\n' - res += ident + '}\n' + ", ".join(['"r"(%s + %s)' % (buf_name, x) for x in subcube]), + clobber, + ) + res += ident + " );\n" + res += ident + " }\n" + res += ident + "}\n" return res -def float_avx_composite_step(buf_name, log_n, from_it, to_it, ident=''): - return composite_step(buf_name, log_n, from_it, to_it, 3, - ['ymm%d' % x for x in range(16)], 'vmovups', - [float_avx_0, float_avx_1, - float_avx_2], float_avx_3_etc, ident) - - -def double_avx_composite_step(buf_name, log_n, from_it, to_it, ident=''): - return composite_step(buf_name, log_n, from_it, to_it, 2, [ - 'ymm%d' % x for x in range(16) - ], 'vmovupd', [double_avx_0, double_avx_1], double_avx_2_etc, ident) - - -def float_sse_composite_step(buf_name, log_n, from_it, to_it, ident=''): - return composite_step(buf_name, log_n, from_it, to_it, 2, - ['xmm%d' % x for x in range(16)], 'movups', - [float_sse_0, float_sse_1], float_sse_2_etc, ident) - - -def double_sse_composite_step(buf_name, log_n, from_it, to_it, ident=''): - return composite_step(buf_name, log_n, from_it, to_it, 1, - ['xmm%d' % x for x in range(16)], 'movupd', - [double_sse_0], double_sse_1_etc, ident) +def float_avx_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 3, + ["ymm%d" % x for x in range(16)], + "vmovups", + [float_avx_0, float_avx_1, float_avx_2], + float_avx_3_etc, + ident, + ) + + +def double_avx_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 2, + ["ymm%d" % x for x in range(16)], + "vmovupd", + [double_avx_0, double_avx_1], + double_avx_2_etc, + ident, + ) + + +def float_sse_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 2, + ["xmm%d" % x for x in range(16)], + "movups", + [float_sse_0, float_sse_1], + float_sse_2_etc, + ident, + ) + + +def double_sse_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 1, + ["xmm%d" % x for x in range(16)], + "movupd", + [double_sse_0], + double_sse_1_etc, + ident, + ) + + +NEON_VECTOR_REGS = [f"v{x}" for x in range(0, 32)] + + +def float_neon_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 2, + NEON_VECTOR_REGS, + MOVE_INSTRUCTION_USE_NEON, + [float_neon_0, float_neon_1], + float_neon_2_etc, + ident, + ) def plain_unmerged(type_name, log_n): - signature = 
"static inline void helper_%s_%d(%s *buf)" % (type_name, log_n, - type_name) - res = '%s;\n' % signature - res += '%s {\n' % signature + signature = "static inline void helper_%s_%d(%s *buf)" % ( + type_name, + log_n, + type_name, + ) + res = "%s;\n" % signature + res += "%s {\n" % signature for i in range(log_n): - res += plain_step(type_name, 'buf', log_n, i, ' ') + res += plain_step(type_name, "buf", log_n, i, " ") res += "}\n" return res def greedy_merged(type_name, log_n, composite_step): try: - composite_step('buf', log_n, 0, 0) + composite_step("buf", log_n, 0, 0) except Exception: - raise Exception('log_n is too small: %d' % log_n) - signature = 'static inline void helper_%s_%d(%s *buf)' % (type_name, log_n, - type_name) - res = '%s;\n' % signature - res += '%s {\n' % signature + raise Exception("log_n is too small: %d" % log_n) + signature = "static inline void helper_%s_%d(%s *buf)" % ( + type_name, + log_n, + type_name, + ) + res = "%s;\n" % signature + res += "%s {\n" % signature cur_it = 0 while cur_it < log_n: cur_to_it = log_n while True: try: - composite_step('buf', log_n, cur_it, cur_to_it) + composite_step("buf", log_n, cur_it, cur_to_it) break - except Exception: + except Exception as e: + print(f"warning: {e}") cur_to_it -= 1 continue - res += composite_step('buf', log_n, cur_it, cur_to_it, ' ') + res += composite_step("buf", log_n, cur_it, cur_to_it, " ") cur_it = cur_to_it - res += '}\n' + res += "}\n" return res def greedy_merged_recursive(type_name, log_n, threshold_step, composite_step): if threshold_step > log_n: - raise Exception('threshold_step must be at most log_n') + raise Exception("threshold_step must be at most log_n") try: - composite_step('buf', threshold_step, 0, 0) + composite_step("buf", threshold_step, 0, 0) except Exception: - raise Exception('threshold_step is too small: %d' % threshold_step) - signature = 'void helper_%s_%d_recursive(%s *buf, int depth)' % (type_name, - log_n, - type_name) - res = '%s;\n' % signature - res += '%s {\n' % signature - res += ' if (depth == %d) {\n' % threshold_step - cur_it = 0 - while cur_it < threshold_step: - cur_to_it = threshold_step - while True: - try: - composite_step('buf', threshold_step, cur_it, cur_to_it) - break - except Exception: - cur_to_it -= 1 - continue - res += composite_step('buf', threshold_step, cur_it, cur_to_it, ' ') - cur_it = cur_to_it - res += ' return;\n' - res += ' }\n' + raise Exception("threshold_step is too small: %d" % threshold_step) + signature = "void helper_%s_%d_recursive(%s *buf, int depth)" % ( + type_name, + log_n, + type_name, + ) + res = "%s;\n" % signature + res += "%s {\n" % signature + res += " if (depth == %d) {\n" % threshold_step + if threshold_step == log_n: + cur_it = 0 + while cur_it < threshold_step: + cur_to_it = threshold_step + while True: + try: + composite_step("buf", threshold_step, cur_it, cur_to_it) + break + except Exception: + cur_to_it -= 1 + continue + res += composite_step("buf", threshold_step, cur_it, cur_to_it, " ") + cur_it = cur_to_it + else: + res += " helper_%s_%d(buf);\n" % (type_name, threshold_step) + + res += " return;\n" + res += " }\n" cur_it = threshold_step while cur_it < log_n: cur_to_it = log_n while True: try: - composite_step('buf', cur_to_it, cur_it, cur_to_it) + composite_step("buf", cur_to_it, cur_it, cur_to_it) break except Exception: cur_to_it -= 1 continue - res += ' if (depth == %d) {\n' % cur_to_it + res += " if (depth == %d) {\n" % cur_to_it for i in range(1 << (cur_to_it - cur_it)): - res += ' helper_%s_%d_recursive(buf + 
%d, %d);\n' % ( - type_name, log_n, i * (1 << cur_it), cur_it) - res += composite_step('buf', cur_to_it, cur_it, cur_to_it, ' ') - res += ' return;\n' - res += ' }\n' + res += " helper_%s_%d_recursive(buf + %d, %d);\n" % ( + type_name, + log_n, + i * (1 << cur_it), + cur_it, + ) + if cur_to_it < log_n: + res += " helper_%s_%d(buf);" % (type_name, cur_to_it) + else: + res += composite_step("buf", cur_to_it, cur_it, cur_to_it, " ") + res += " return;\n" + res += " }\n" cur_it = cur_to_it - res += '}\n' - signature = 'void helper_%s_%d(%s *buf)' % (type_name, log_n, type_name) - res += '%s;\n' % signature - res += '%s {\n' % signature - res += ' helper_%s_%d_recursive(buf, %d);\n' % (type_name, log_n, log_n) - res += '}\n' + res += "}\n" + signature = "void helper_%s_%d(%s *buf)" % (type_name, log_n, type_name) + res += "%s;\n" % signature + res += "%s {\n" % signature + res += " helper_%s_%d_recursive(buf, %d);\n" % (type_name, log_n, log_n) + res += "}\n" return res def extract_time(data): - cpu_time = float(data['cpu_time']) - time_unit = data['time_unit'] - if time_unit != 'ns': - raise Exception('nanoseconds expected') + cpu_time = float(data["cpu_time"]) + time_unit = data["time_unit"] + if time_unit != "ns": + raise Exception("nanoseconds expected") return cpu_time / 1e9 def get_mean_stddev(): - with open('measurements/output.csv', 'r') as csvfile: + with open("measurements/output.csv", "r") as csvfile: reader = csv.reader(csvfile) first = True for row in reader: @@ -494,102 +732,138 @@ def get_mean_stddev(): first = False else: data = {} - for (x, y) in zip(header, row): + for x, y in zip(header, row): data[x] = y - if data['name'] == 'benchmark_fht_mean': + if data["name"] == "benchmark_fht_mean": mean = extract_time(data) - elif data['name'] == 'benchmark_fht_stddev': + elif data["name"] == "benchmark_fht_stddev": stddev = extract_time(data) return mean def measure_time(code, log_n, type_name, method_name, num_it=3): if num_it % 2 == 0: - raise Exception('even number of runs: %d' % num_it) - with open('measurements/to_run.h', 'w') as output: + raise Exception("even number of runs: %d" % num_it) + with open("measurements/to_run.h", "w") as output: output.write(code) - output.write('const int log_n = %d;\n' % log_n) - signature = 'void run(%s *buf)' % type_name - output.write('%s;\n' % signature) - output.write('%s {\n' % signature) - output.write(' %s(buf);\n' % method_name) - output.write('}\n') - with open('/dev/null', 'wb') as devnull: + output.write("const int log_n = %d;\n" % log_n) + signature = "void run(%s *buf)" % type_name + output.write("%s;\n" % signature) + output.write("%s {\n" % signature) + output.write(" %s(buf);\n" % method_name) + output.write("}\n") + with open("/dev/null", "wb") as devnull: code = subprocess.call( - 'cd measurements && make run_%s' % type_name, - shell=True, - stdout=devnull) + "cd measurements && make run_%s" % type_name, shell=True, stdout=devnull + ) if code != 0: - raise Exception('bad exit code') + raise Exception("bad exit code") code = subprocess.call( - './measurements/run_%s --benchmark_repetitions=%d --benchmark_format=csv > ./measurements/output.csv' + "./measurements/run_%s --benchmark_repetitions=%d --benchmark_format=csv > ./measurements/output.csv" % (type_name, num_it), shell=True, - stderr=devnull) + stderr=devnull, + ) if code != 0: - raise Exception('bad exit code') + raise Exception("bad exit code") return get_mean_stddev() -if __name__ == '__main__': - final_code = '#include "fht.h"\n' +# Configuration parameter; set to False 
if you want the absolute fastest code without regard to size. +CARE_ABOUT_CODE_SIZE = True + +# When CARE_ABOUT_CODE_SIZE, accept the smallest code that is not slower than +# MAX_PERFORMANCE_PENALTY_FOR_REDUCED_SIZE * the fastest time. +MAX_PERFORMANCE_PENALTY_FOR_REDUCED_SIZE = 1.1 + + +if __name__ == "__main__": + final_code = '// @generated\n#include "fht.h"\n' + code_so_far = "" hall_of_fame = [] - for (type_name, - composite_step_generator) in [('float', float_avx_composite_step), - ('double', double_avx_composite_step)]: + for type_name, composite_step_generator in [("float", float_neon_composite_step)]: for log_n in range(1, max_log_n + 1): - sys.stdout.write('log_n = %d\n' % log_n) + sys.stdout.write("log_n = %d\n" % log_n) times = [] try: - (res, desc) = (greedy_merged(type_name, log_n, - composite_step_generator), - 'greedy_merged') + (res, desc) = ( + greedy_merged(type_name, log_n, composite_step_generator), + "greedy_merged", + ) except Exception: - (res, desc) = (plain_unmerged(type_name, log_n), - 'plain_unmerged') - time = measure_time(res, log_n, type_name, - 'helper_%s_%d' % (type_name, log_n)) - times.append((time, res, desc)) - sys.stdout.write('log_n = %d; iterative; time = %.10e\n' % (log_n, - time)) + (res, desc) = (plain_unmerged(type_name, log_n), "plain_unmerged") + time = measure_time( + code_so_far + res, log_n, type_name, "helper_%s_%d" % (type_name, log_n) + ) + code_size = res.count("\n") + times.append((time, res, code_size, desc)) + sys.stdout.write( + "log_n = %d; iterative; code_size = %d; time = %.10e\n" + % (log_n, code_size, time) + ) for threshold_step in range(1, log_n + 1): try: - res = greedy_merged_recursive(type_name, log_n, - threshold_step, - composite_step_generator) - time = measure_time(res, log_n, type_name, - 'helper_%s_%d' % (type_name, log_n)) + res = greedy_merged_recursive( + type_name, log_n, threshold_step, composite_step_generator + ) + time = measure_time( + code_so_far + res, + log_n, + type_name, + "helper_%s_%d" % (type_name, log_n), + ) + code_size = res.count("\n") times.append( - (time, res, - 'greedy_merged_recursive %d' % threshold_step)) + ( + time, + res, + code_size, + "greedy_merged_recursive %d" % threshold_step, + ) + ) sys.stdout.write( - 'log_n = %d; threshold_step = %d; time = %.10e\n' % - (log_n, threshold_step, time)) - except Exception: - sys.stdout.write('FAIL: %d\n' % threshold_step) - (best_time, best_code, best_desc) = min(times) + "log_n = %d; threshold_step = %d; code_size = %d; time = %.10e\n" + % (log_n, threshold_step, code_size, time) + ) + except Exception as e: + sys.stdout.write(f"FAIL: {threshold_step} ({e})\n") + if CARE_ABOUT_CODE_SIZE: + fastest_time = min(times)[0] + times_by_size = sorted(times, key=lambda x: x[2]) + for x in times_by_size: + if x[0] <= fastest_time * MAX_PERFORMANCE_PENALTY_FOR_REDUCED_SIZE: + smallest_acceptable = x + break + (best_time, best_code, best_code_size, best_desc) = smallest_acceptable + else: + (best_time, best_code, best_code_size, best_desc) = min(times) hall_of_fame.append((type_name, log_n, best_time, best_desc)) final_code += best_code - sys.stdout.write('log_n = %d; best_time = %.10e; %s\n' % - (log_n, best_time, best_desc)) - final_code += 'int fht_%s(%s *buf, int log_n) {\n' % (type_name, - type_name) - final_code += ' if (log_n == 0) {\n' - final_code += ' return 0;\n' - final_code += ' }\n' + code_so_far += best_code + sys.stdout.write( + "log_n = %d; best_time = %.10e; %s\n" % (log_n, best_time, best_desc) + ) + final_code += "int fht_%s(%s *buf, int 
log_n) {\n" % (type_name, type_name) + final_code += " if (log_n == 0) {\n" + final_code += " return 0;\n" + final_code += " }\n" for i in range(1, max_log_n + 1): - final_code += ' if (log_n == %d) {\n' % i - final_code += ' helper_%s_%d(buf);\n' % (type_name, i) - final_code += ' return 0;\n' - final_code += ' }\n' - final_code += ' return 1;\n' - final_code += '}\n' - with open('fht_avx.c', 'w') as output: + final_code += " if (log_n == %d) {\n" % i + final_code += " helper_%s_%d(buf);\n" % (type_name, i) + final_code += " return 0;\n" + final_code += " }\n" + final_code += " return 1;\n" + final_code += "}\n" + with open("fht_neon.c", "w") as output: output.write(final_code) - sys.stdout.write('hall of fame\n') - with open('hall_of_fame_avx.txt', 'w') as hof: - for (type_name, log_n, best_time, best_desc) in hall_of_fame: - s = 'type_name = %s; log_n = %d; best_time = %.10e; best_desc = %s\n' % ( - type_name, log_n, best_time, best_desc) + sys.stdout.write("hall of fame\n") + with open("hall_of_fame_neon.txt", "w") as hof: + for type_name, log_n, best_time, best_desc in hall_of_fame: + s = "type_name = %s; log_n = %d; best_time = %.10e; best_desc = %s\n" % ( + type_name, + log_n, + best_time, + best_desc, + ) sys.stdout.write(s) hof.write(s) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_neon.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_neon.txt new file mode 100644 index 00000000000..547009956e5 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_neon.txt @@ -0,0 +1,30 @@ +type_name = float; log_n = 1; best_time = 4.1929000000e-08; best_desc = plain_unmerged +type_name = float; log_n = 2; best_time = 4.1758100000e-08; best_desc = greedy_merged +type_name = float; log_n = 3; best_time = 4.2130400000e-08; best_desc = greedy_merged_recursive 2 +type_name = float; log_n = 4; best_time = 4.1849300000e-08; best_desc = greedy_merged_recursive 3 +type_name = float; log_n = 5; best_time = 4.2931300000e-08; best_desc = greedy_merged_recursive 4 +type_name = float; log_n = 6; best_time = 4.5379000000e-08; best_desc = greedy_merged_recursive 3 +type_name = float; log_n = 7; best_time = 6.4887900000e-08; best_desc = greedy_merged_recursive 3 +type_name = float; log_n = 8; best_time = 1.0970500000e-07; best_desc = greedy_merged +type_name = float; log_n = 9; best_time = 2.2306600000e-07; best_desc = greedy_merged_recursive 8 +type_name = float; log_n = 10; best_time = 4.4169300000e-07; best_desc = greedy_merged_recursive 8 +type_name = float; log_n = 11; best_time = 9.7532700000e-07; best_desc = greedy_merged_recursive 10 +type_name = float; log_n = 12; best_time = 1.9247200000e-06; best_desc = greedy_merged_recursive 10 +type_name = float; log_n = 13; best_time = 3.6199200000e-06; best_desc = greedy_merged +type_name = float; log_n = 14; best_time = 8.4450100000e-06; best_desc = greedy_merged_recursive 10 +type_name = float; log_n = 15; best_time = 1.6781100000e-05; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 16; best_time = 3.7584000000e-05; best_desc = greedy_merged_recursive 15 +type_name = float; log_n = 17; best_time = 7.6645500000e-05; best_desc = greedy_merged_recursive 15 +type_name = float; log_n = 18; best_time = 1.7394400000e-04; best_desc = greedy_merged_recursive 17 +type_name = float; log_n = 19; best_time = 3.9186900000e-04; best_desc = greedy_merged_recursive 18 +type_name = float; log_n = 20; best_time = 8.0344800000e-04; best_desc = greedy_merged_recursive 18 
+type_name = float; log_n = 21; best_time = 1.8539700000e-03; best_desc = greedy_merged_recursive 20 +type_name = float; log_n = 22; best_time = 3.6448200000e-03; best_desc = greedy_merged_recursive 20 +type_name = float; log_n = 23; best_time = 8.4403500000e-03; best_desc = greedy_merged_recursive 22 +type_name = float; log_n = 24; best_time = 1.8726400000e-02; best_desc = greedy_merged_recursive 23 +type_name = float; log_n = 25; best_time = 3.8848300000e-02; best_desc = greedy_merged_recursive 23 +type_name = float; log_n = 26; best_time = 8.6437100000e-02; best_desc = greedy_merged_recursive 25 +type_name = float; log_n = 27; best_time = 1.9369800000e-01; best_desc = greedy_merged_recursive 26 +type_name = float; log_n = 28; best_time = 3.9619200000e-01; best_desc = greedy_merged_recursive 26 +type_name = float; log_n = 29; best_time = 1.0401300000e+00; best_desc = greedy_merged_recursive 28 +type_name = float; log_n = 30; best_time = 2.0733800000e+00; best_desc = greedy_merged_recursive 29
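
For readers of the generated kernels, it may help to see the transform they implement spelled out. Below is a minimal scalar sketch, not part of the patch: fht_float_reference mirrors the loop nest that gen.py's plain_step emits (one radix-2 butterfly pass per level), and main shows the intended use of the library entry point fht_float, which transforms 2^log_n floats in place and returns nonzero for unsupported sizes.

// Illustrative only -- fht_float_reference and main are not part of the
// patch. fht_float comes from fht.h; everything else is standard C.
#include <stdio.h>
#include "fht.h"

// Scalar version of the unnormalized Walsh-Hadamard transform that the
// generated helper_float_<k> kernels compute; this is exactly the loop
// nest gen.py's plain_step() emits.
static void fht_float_reference(float* buf, int log_n) {
  int n = 1 << log_n;
  for (int it = 0; it < log_n; ++it) {      // one butterfly pass per level
    int step = 1 << it;
    for (int j = 0; j < n; j += 2 * step) { // blocks of 2^(it+1) elements
      for (int k = j; k < j + step; ++k) {  // pair (k, k + step)
        float u = buf[k];
        float v = buf[k + step];
        buf[k] = u + v;
        buf[k + step] = u - v;
      }
    }
  }
}

int main(void) {
  float a[8] = {1, 0, 0, 0, 0, 0, 0, 0};
  float b[8] = {1, 0, 0, 0, 0, 0, 0, 0};
  fht_float_reference(a, 3);    // impulse in -> all ones out (unnormalized)
  if (fht_float(b, 3) != 0) {   // returns 1 when log_n is unsupported (> 30)
    return 1;
  }
  for (int i = 0; i < 8; ++i) {
    printf("%g %g\n", a[i], b[i]); // the two columns should agree
  }
  return 0;
}

On the hall-of-fame numbers above: extract_time converts the benchmark's nanoseconds to seconds, so the times are seconds per transform; e.g. log_n = 20 processes 2^20 floats in roughly 0.8 ms.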