|
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include <arm_neon.h>

inline void InnerProduct_Step(const float16_t *&vec1, const float16_t *&vec2, float16x8_t &acc) {
    // Load half-precision vectors and advance both pointers by one 8-lane chunk
    float16x8_t v1 = vld1q_f16(vec1);
    float16x8_t v2 = vld1q_f16(vec2);
    vec1 += 8;
    vec2 += 8;

    // Multiply and accumulate: acc += v1 * v2 (element-wise, fused)
    acc = vfmaq_f16(acc, v1, v2);
}

template <unsigned char residual> // 0..31
float FP16_InnerProduct_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
    const auto *const v1End = vec1 + dimension;
    float16x8_t acc1 = vdupq_n_f16(0.0f);
    float16x8_t acc2 = vdupq_n_f16(0.0f);
    float16x8_t acc3 = vdupq_n_f16(0.0f);
    float16x8_t acc4 = vdupq_n_f16(0.0f);

    // First, handle the partial chunk residual
    if constexpr (residual % 8) {
        auto constexpr chunk_residual = residual % 8;
        // TODO: special-case some residuals and benchmark whether it's faster
        constexpr uint16x8_t mask = {
            0xFFFF,
            (chunk_residual >= 2) ? 0xFFFF : 0,
            (chunk_residual >= 3) ? 0xFFFF : 0,
            (chunk_residual >= 4) ? 0xFFFF : 0,
            (chunk_residual >= 5) ? 0xFFFF : 0,
            (chunk_residual >= 6) ? 0xFFFF : 0,
            (chunk_residual >= 7) ? 0xFFFF : 0,
            0,
        };

        // Load partial vectors
        float16x8_t v1 = vld1q_f16(vec1);
        float16x8_t v2 = vld1q_f16(vec2);

        // Apply mask to both vectors, zeroing the lanes beyond the residual
        float16x8_t masked_v1 = vbslq_f16(mask, v1, acc1); // `acc1` is still all zeros here
        float16x8_t masked_v2 = vbslq_f16(mask, v2, acc2); // `acc2` is still all zeros here

        // Multiply and accumulate
        acc1 = vfmaq_f16(acc1, masked_v1, masked_v2);

        // Advance pointers
        vec1 += chunk_residual;
        vec2 += chunk_residual;
    }

    // Handle (residual - (residual % 8)) elements in chunks of 8 float16
    if constexpr (residual >= 8)
        InnerProduct_Step(vec1, vec2, acc2);
    if constexpr (residual >= 16)
        InnerProduct_Step(vec1, vec2, acc3);
    if constexpr (residual >= 24)
        InnerProduct_Step(vec1, vec2, acc4);

    // Process the rest of the vectors (the full 32-element chunks)
    while (vec1 < v1End) {
        // TODO: use `vld1q_f16_x4` for quad-loading?
        InnerProduct_Step(vec1, vec2, acc1);
        InnerProduct_Step(vec1, vec2, acc2);
        InnerProduct_Step(vec1, vec2, acc3);
        InnerProduct_Step(vec1, vec2, acc4);
    }

    // Combine the four accumulators pairwise into one
    acc1 = vpaddq_f16(acc1, acc3);
    acc2 = vpaddq_f16(acc2, acc4);
    acc1 = vpaddq_f16(acc1, acc2);

    // Horizontal sum of the accumulated values, widened to float32
    float32x4_t sum_f32 = vcvt_f32_f16(vget_low_f16(acc1));
    sum_f32 = vaddq_f32(sum_f32, vcvt_f32_f16(vget_high_f16(acc1)));

    // Pairwise add to get the horizontal sum
    float32x2_t sum_2 = vadd_f32(vget_low_f32(sum_f32), vget_high_f32(sum_f32));
    sum_2 = vpadd_f32(sum_2, sum_2);

    // Return the inner-product distance: 1 - dot(vec1, vec2)
    return 1.0f - vget_lane_f32(sum_2, 0);
}
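
// --- Usage sketch (illustration only, not part of the original change) ---
// The `residual` template parameter is expected to be `dimension % 32`, so a caller
// would typically instantiate all 32 variants once and select one when the index is
// configured. The table builder and dispatcher below are hypothetical helpers, shown
// only to illustrate how the templated kernel might be chosen at runtime.

#include <array>
#include <cstddef>
#include <utility>

using FP16DistFn = float (*)(const void *, const void *, size_t);

template <std::size_t... Is>
static constexpr std::array<FP16DistFn, sizeof...(Is)>
FP16_IP_BuildTable(std::index_sequence<Is...>) {
    // One entry per possible residual value (0..31)
    return {{FP16_InnerProduct_NEON_HP<Is>...}};
}

static inline float FP16_InnerProduct_NEON_HP_AnyDim(const void *pVect1v, const void *pVect2v,
                                                     size_t dimension) {
    static constexpr auto table = FP16_IP_BuildTable(std::make_index_sequence<32>{});
    return table[dimension % 32](pVect1v, pVect2v, dimension);
}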