|
/*
 *Copyright Redis Ltd. 2021 - present
 *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 *the Server Side Public License v1 (SSPLv1).
 */

#include <arm_neon.h>

// Assumes little-endianness
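// Converts the low and high halves of each bfloat16x8_t to float32x4_t,
// takes the element-wise differences, and accumulates their squares into
// `acc` via fused multiply-add.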
inline void L2Sqr_Op(float32x4_t &acc, bfloat16x8_t &v1, bfloat16x8_t &v2) {
    float32x4_t v1_lo = vcvtq_low_f32_bf16(v1);
    float32x4_t v2_lo = vcvtq_low_f32_bf16(v2);
    float32x4_t diff_lo = vsubq_f32(v1_lo, v2_lo);

    acc = vfmaq_f32(acc, diff_lo, diff_lo);

    float32x4_t v1_hi = vcvtq_high_f32_bf16(v1);
    float32x4_t v2_hi = vcvtq_high_f32_bf16(v2);
    float32x4_t diff_hi = vsubq_f32(v1_hi, v2_hi);

    acc = vfmaq_f32(acc, diff_hi, diff_hi);
}

inline void L2Sqr_Step(const bfloat16_t *&vec1, const bfloat16_t *&vec2, float32x4_t &acc) {
    // Load 8 bfloat16 (brain float) elements from each vector and advance the pointers
    bfloat16x8_t v1 = vld1q_bf16(vec1);
    bfloat16x8_t v2 = vld1q_bf16(vec2);
    vec1 += 8;
    vec2 += 8;
    L2Sqr_Op(acc, v1, v2);
}

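// Computes the squared L2 (Euclidean) distance between two bfloat16 vectors.
// `residual` is the number of leftover elements (typically dimension % 32),
// resolved at compile time so the tail is handled without a runtime loop.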
template <unsigned char residual> // 0..31
float BF16_L2Sqr_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *vec1 = static_cast<const bfloat16_t *>(pVect1v);
    const auto *vec2 = static_cast<const bfloat16_t *>(pVect2v);
    const auto *const v1End = vec1 + dimension;
    float32x4_t acc1 = vdupq_n_f32(0.0f);
    float32x4_t acc2 = vdupq_n_f32(0.0f);
    float32x4_t acc3 = vdupq_n_f32(0.0f);
    float32x4_t acc4 = vdupq_n_f32(0.0f);

    // First, handle the partial chunk of the residual (residual % 8 elements)
    if constexpr (residual % 8) {
        auto constexpr chunk_residual = residual % 8;
        // TODO: special-case some residuals and benchmark whether it's better
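        // The mask keeps the first `chunk_residual` lanes and zeroes the rest, so the
        // masked-out lanes contribute nothing to the accumulated sum (lane 0 is always
        // kept since chunk_residual >= 1; lane 7 is always cleared since it is <= 7).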
        constexpr uint16x8_t mask = {
            0xFFFF,
            (chunk_residual >= 2) ? 0xFFFF : 0,
            (chunk_residual >= 3) ? 0xFFFF : 0,
            (chunk_residual >= 4) ? 0xFFFF : 0,
            (chunk_residual >= 5) ? 0xFFFF : 0,
            (chunk_residual >= 6) ? 0xFFFF : 0,
            (chunk_residual >= 7) ? 0xFFFF : 0,
            0,
        };

        // Load partial vectors
        bfloat16x8_t v1 = vld1q_bf16(vec1);
        bfloat16x8_t v2 = vld1q_bf16(vec2);

        // Apply mask to both vectors
        bfloat16x8_t masked_v1 =
            vreinterpretq_bf16_u16(vandq_u16(vreinterpretq_u16_bf16(v1), mask));
        bfloat16x8_t masked_v2 =
            vreinterpretq_bf16_u16(vandq_u16(vreinterpretq_u16_bf16(v2), mask));

        L2Sqr_Op(acc1, masked_v1, masked_v2);

        // Advance pointers
        vec1 += chunk_residual;
        vec2 += chunk_residual;
    }

    // Handle the remaining (residual - (residual % 8)) elements in chunks of 8 bfloat16
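    // Each step below uses its own accumulator so the FMA dependency chains stay independent.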
    if constexpr (residual >= 8)
        L2Sqr_Step(vec1, vec2, acc2);
    if constexpr (residual >= 16)
        L2Sqr_Step(vec1, vec2, acc3);
    if constexpr (residual >= 24)
        L2Sqr_Step(vec1, vec2, acc4);

    // Process the rest of the vectors (the full chunks part)
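    // Each iteration consumes 32 elements (4 steps of 8 bfloat16 each).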
    while (vec1 < v1End) {
        // TODO: use `vld1q_bf16_x4` for quad-loading?
        L2Sqr_Step(vec1, vec2, acc1);
        L2Sqr_Step(vec1, vec2, acc2);
        L2Sqr_Step(vec1, vec2, acc3);
        L2Sqr_Step(vec1, vec2, acc4);
    }

    // Combine the four accumulators
    acc1 = vpaddq_f32(acc1, acc3);
    acc2 = vpaddq_f32(acc2, acc4);
    acc1 = vpaddq_f32(acc1, acc2);

    // Pairwise add to get the horizontal sum
    float32x2_t folded = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
    folded = vpadd_f32(folded, folded);

    // Extract the result
    return vget_lane_f32(folded, 0);
}
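
// --- Usage sketch (illustrative only, not part of this file) ---
// A caller would typically pick the template instantiation matching dimension % 32
// at compile time. The dispatch helper below is a hypothetical example; the names
// `MakeBF16L2Table` and `BF16_L2Sqr` are assumptions, and the library's actual
// selection logic lives elsewhere.
#include <array>
#include <cstddef>
#include <utility>

using DistFn = float (*)(const void *, const void *, size_t);

// Build a lookup table of all 32 residual specializations.
template <size_t... Is>
constexpr std::array<DistFn, sizeof...(Is)> MakeBF16L2Table(std::index_sequence<Is...>) {
    return {BF16_L2Sqr_NEON<Is>...};
}

// Dispatch to the specialization matching dimension % 32.
inline float BF16_L2Sqr(const void *v1, const void *v2, size_t dim) {
    static constexpr auto table = MakeBF16L2Table(std::make_index_sequence<32>{});
    return table[dim % 32](v1, v2, dim);
}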