@@ -57,6 +57,9 @@ D DemoteTo(T t, float maxColors);
5757template <typename T>
5858T CubicBSpline (T t);
5959
60+ template <typename T>
61+ T LanczosWindow (T x, const T a);
62+
6063#if __arm64__
6164#include < arm_neon.h>
6265
@@ -71,6 +74,58 @@ static inline float32x4_t Cos(const float32x4_t d) {
7174 return vmlaq_f32 (C0, x2, vmlaq_f32 (C1, x2, vmlaq_n_f32 (C2, x2, C3)));
7275}
7376
77+ /* * Sin polynomial coefficients */
78+ constexpr float te_sin_coeff2 = 0 .166666666666f ; // 1/(2*3)
79+ constexpr float te_sin_coeff3 = 0 .05f ; // 1/(4*5)
80+ constexpr float te_sin_coeff4 = 0 .023809523810f ; // 1/(6*7)
81+ constexpr float te_sin_coeff5 = 0 .013888888889f ; // 1/(8*9)
82+
83+ __attribute__ ((always_inline))
84+ inline float32x4_t vsinq_f32(float32x4_t val)
85+ {
86+ const float32x4_t pi_v = vdupq_n_f32 (M_PI);
87+ const float32x4_t pio2_v = vdupq_n_f32 (M_PI / 2 );
88+ const float32x4_t ipi_v = vdupq_n_f32 (1 / M_PI);
89+
90+ // Find positive or negative
91+ const int32x4_t c_v = vabsq_s32 (vcvtq_s32_f32 (vmulq_f32 (val, ipi_v)));
92+ const uint32x4_t sign_v = vcleq_f32 (val, vdupq_n_f32 (0 ));
93+ const uint32x4_t odd_v = vandq_u32 (vreinterpretq_u32_s32 (c_v), vdupq_n_u32 (1 ));
94+
95+ uint32x4_t neg_v = veorq_u32 (odd_v, sign_v);
96+
97+ // Modulus a - (n * int(a*(1/n)))
98+ float32x4_t ma = vsubq_f32 (vabsq_f32 (val), vmulq_f32 (pi_v, vcvtq_f32_s32 (c_v)));
99+ const uint32x4_t reb_v = vcgeq_f32 (ma, pio2_v);
100+
101+ // Rebase a between 0 and pi/2
102+ ma = vbslq_f32 (reb_v, vsubq_f32 (pi_v, ma), ma);
103+
104+ // Taylor series
105+ const float32x4_t ma2 = vmulq_f32 (ma, ma);
106+
107+ // 2nd elem: x^3 / 3!
108+ float32x4_t elem = vmulq_f32 (vmulq_f32 (ma, ma2), vdupq_n_f32 (te_sin_coeff2));
109+ float32x4_t res = vsubq_f32 (ma, elem);
110+
111+ // 3rd elem: x^5 / 5!
112+ elem = vmulq_f32 (vmulq_f32 (elem, ma2), vdupq_n_f32 (te_sin_coeff3));
113+ res = vaddq_f32 (res, elem);
114+
115+ // 4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val)
116+ elem = vmulq_f32 (vmulq_f32 (elem, ma2), vdupq_n_f32 (te_sin_coeff4));
117+ res = vsubq_f32 (res, elem);
118+
119+ // 5th elem: x^9 / 9!
120+ elem = vmulq_f32 (vmulq_f32 (elem, ma2), vdupq_n_f32 (te_sin_coeff5));
121+ res = vaddq_f32 (res, elem);
122+
123+ // Change of sign
124+ neg_v = vshlq_n_u32 (neg_v, 31 );
125+ res = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (res), neg_v));
126+ return res;
127+ }
128+
74129__attribute__ ((always_inline))
75130static inline float32x4_t FastSin(const float32x4_t v) {
76131 constexpr float A = 4 .0f /(M_PI*M_PI);
@@ -87,7 +142,7 @@ static inline float32x4_t FastSin(const float32x4_t v) {
87142__attribute__ ((always_inline))
88143static inline float32x4_t Sinc(const float32x4_t v) {
89144 const float32x4_t zeros = vdupq_n_f32 (0 );
90- const float32x4_t ones = vdupq_n_f32 (0 );
145+ const float32x4_t ones = vdupq_n_f32 (1 );
91146 uint32x4_t mask = vceqq_f32 (v, zeros);
92147 // if < 0 then set to 1
93148 float32x4_t x = vbslq_f32 (mask, ones, v);
@@ -104,20 +159,22 @@ static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {
104159 const float32x4_t zeros = vdupq_n_f32 (0 );
105160 uint32x4_t mask = vcltq_f32 (vabsq_f32 (v), fullLength);
106161 float32x4_t rv = vmulq_n_f32 (v, M_PI);
107- float32x4_t x = vmulq_f32 (Sinc (rv), Sinc (vmulq_f32 (v , invLength)));
162+ float32x4_t x = vmulq_f32 (Sinc (rv), Sinc (vmulq_f32 (rv , invLength)));
108163 x = vbslq_f32 (mask, x, zeros);
109164 return x;
110165}
111166
112167__attribute__ ((always_inline))
113168static inline float32x4_t HannWindow(const float32x4_t d, const float length) {
114- const float32x4_t fullLength = vrecpeq_f32 (vdupq_n_f32 (length));
115- const float32x4_t halfLength = vdupq_n_f32 (length / 2 );
169+ const float doublePi = 2 * M_PI;
170+ const float32x4_t wndSize = vdupq_n_f32 (length * 2 - 1 );
171+ const float32x4_t rWndSize = vrecpeq_f32 (wndSize);
172+ uint32x4_t mask = vcgtq_f32 (vabsq_f32 (d), wndSize);
173+ const float32x4_t ones = vdupq_n_f32 (1 );
174+ float32x4_t cx = vmulq_n_f32 (vsubq_f32 (ones, Cos (vmulq_f32 (vmulq_n_f32 (d, doublePi), rWndSize))), 0 .5f );
116175 const float32x4_t zeros = vdupq_n_f32 (0 );
117- uint32x4_t mask = vcltq_f32 (vabsq_f32 (d), halfLength);
118- float32x4_t cx = Cos (vmulq_f32 (vmulq_n_f32 (d, M_PI), fullLength));
119- cx = vmulq_f32 (vmulq_f32 (cx, cx), fullLength);
120- return vbslq_f32 (mask, zeros, cx);
176+ cx = vbslq_f32 (mask, zeros, cx);
177+ return cx;
121178}
122179
123180__attribute__ ((always_inline))
@@ -220,9 +277,6 @@ T MitchellNetravali(T d, T p0, T p1, T p2, T p3);
220277template <typename T>
221278T sinc (T x);
222279
223- template <typename T>
224- T LanczosWindow (T x, const T a);
225-
226280template <typename T>
227281T HannWindow (T x, const T length);
228282
0 commit comments