@@ -60,17 +60,68 @@ T CubicBSpline(T t);
6060#if __arm64__
6161#include < arm_neon.h>
6262
/**
 * Per-lane cosine approximation.
 *
 * Evaluates an even polynomial in d, c0 + c2*d^2 + c4*d^4 + c6*d^6, using
 * Horner's scheme on d^2.
 * NOTE(review): the coefficients look fitted for a limited range around zero
 * (no range reduction is performed here) - confirm the intended input domain.
 *
 * @param d four input angles
 * @return four polynomial values
 */
__attribute__((always_inline))
static inline float32x4_t Cos(const float32x4_t d) {
    constexpr float kCoeff0 = 0.99940307;
    constexpr float kCoeff2 = -0.49558072;
    constexpr float kCoeff4 = 0.03679168;
    constexpr float kCoeff6 = -0.00434102;

    const float32x4_t dSquared = vmulq_f32(d, d);

    // Innermost Horner term first: c4 + d^2 * c6.
    float32x4_t acc = vmlaq_n_f32(vdupq_n_f32(kCoeff4), dSquared, kCoeff6);
    // c2 + d^2 * (...)
    acc = vmlaq_f32(vdupq_n_f32(kCoeff2), dSquared, acc);
    // c0 + d^2 * (...)
    acc = vmlaq_f32(vdupq_n_f32(kCoeff0), dSquared, acc);
    return acc;
}
7273
73- inline float32x4_t CubicInterpolation (const float32x4_t d,
/**
 * Per-lane sine approximation.
 *
 * The core is the parabola y = (4/pi^2) * r * (pi - |r|), refined by the
 * polynomial y * ((1 - P - Q) + P*|y| + Q*|y|^2), which is exact at the
 * parabola's peak (|y| = 1).
 *
 * The parabola alone is only meaningful on [0, pi], so the argument is first
 * reduced: k = round(v / pi), r = v - k*pi lies in [-pi/2, pi/2], and
 * sin(v) = (-1)^k * sin(r). Using |r| and |y| makes the approximation odd,
 * which covers the negative half of the reduced range. For inputs in
 * [0, pi/2] the reduction is a no-op.
 *
 * @param v four input angles in radians
 * @return four approximate sine values
 */
__attribute__((always_inline))
static inline float32x4_t FastSin(const float32x4_t v) {
    const float pi = M_PI;
    const float invPi = 1.0f / pi;
    const float A = 4.0f / (M_PI * M_PI);
    const float32x4_t P = vdupq_n_f32(0.1952403377008734f);
    const float32x4_t Q = vdupq_n_f32(0.01915214119105392f);
    // Chosen so that the refinement polynomial evaluates to 1 at |y| = 1.
    const float32x4_t fract = vsubq_f32(vsubq_f32(vdupq_n_f32(1.0f), P), Q);

    // Range reduction: k is the nearest multiple of pi, r the remainder.
    const float32x4_t k = vrndnq_f32(vmulq_n_f32(v, invPi));
    const float32x4_t r = vmlsq_n_f32(v, k, pi);

    // Odd parabola through (-pi/2, -1), (0, 0), (pi/2, 1).
    float32x4_t y = vmulq_f32(vmulq_n_f32(r, A),
                              vsubq_f32(vdupq_n_f32(pi), vabsq_f32(r)));
    const float32x4_t absY = vabsq_f32(y);
    y = vmulq_f32(y, vmlaq_f32(fract, absY, vmlaq_f32(P, absY, Q)));

    // Odd k flips the sign: move the parity bit of k into the float sign bit.
    const uint32x4_t signFlip =
        vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(k)), 31);
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(y), signFlip));
}
86+
/**
 * Per-lane unnormalized sinc: sin(v) / v, with the removable singularity at
 * v == 0 filled in as 1.
 *
 * @param v four input values in radians
 * @return four sinc values
 */
__attribute__((always_inline))
static inline float32x4_t Sinc(const float32x4_t v) {
    const float32x4_t zeros = vdupq_n_f32(0.0f);
    const float32x4_t ones = vdupq_n_f32(1.0f);

    // Lanes that are exactly zero would otherwise produce 0 * inf = NaN.
    const uint32x4_t isZero = vceqq_f32(v, zeros);
    // Substitute 1 in those lanes so the reciprocal below stays finite.
    const float32x4_t safe = vbslq_f32(isZero, ones, v);

    // vrecpeq_f32 is only a coarse estimate; one Newton-Raphson step
    // (x' = x * (2 - d*x), via vrecpsq_f32) roughly doubles its precision.
    float32x4_t inv = vrecpeq_f32(safe);
    inv = vmulq_f32(inv, vrecpsq_f32(safe, inv));

    const float32x4_t ratio = vmulq_f32(FastSin(safe), inv);

    // sinc(0) == 1 in the lanes that were zero; sin(v)/v everywhere else.
    return vbslq_f32(isZero, ones, ratio);
}
99+
/**
 * Per-lane Lanczos kernel: sinc(pi*v) * sinc(pi*v / a) for |v| < a, and 0
 * outside that support. Sinc here is the unnormalized sin(x)/x, so both
 * arguments carry the factor pi.
 *
 * @param v four sample offsets
 * @param a kernel support radius (number of lobes)
 * @return four kernel weights
 */
__attribute__((always_inline))
static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {
    const float pi = M_PI;
    const float32x4_t zeros = vdupq_n_f32(0.0f);

    // True in lanes that fall inside the kernel support.
    const uint32x4_t inside = vcltq_f32(vabsq_f32(v), vdupq_n_f32(a));

    const float32x4_t piV = vmulq_n_f32(v, pi);
    // 1/a is a scalar, so compute it exactly once instead of using a
    // per-lane reciprocal estimate.
    const float32x4_t piVOverA = vmulq_n_f32(piV, 1.0f / a);

    const float32x4_t kernel = vmulq_f32(Sinc(piV), Sinc(piVOverA));

    // vbslq_f32 picks its first value operand where the mask is set:
    // keep the kernel inside the support and zero everything outside.
    return vbslq_f32(inside, kernel, zeros);
}
111+
/**
 * Per-lane Hann-style window: cos^2(pi * d / length) scaled by 1 / length
 * for |d| < length / 2, and 0 outside.
 * NOTE(review): the 1 / length scale is kept from the existing code; confirm
 * it is the intended normalisation.
 *
 * @param d four sample offsets
 * @param length full window length
 * @return four window weights
 */
__attribute__((always_inline))
static inline float32x4_t HannWindow(const float32x4_t d, const float length) {
    const float pi = M_PI;
    // Scalar reciprocal computed once; exact, unlike a vrecpeq_f32 estimate.
    const float invLength = 1.0f / length;
    const float32x4_t zeros = vdupq_n_f32(0.0f);

    // True in lanes that fall inside the window.
    const uint32x4_t inside = vcltq_f32(vabsq_f32(d), vdupq_n_f32(length / 2));

    const float32x4_t c = Cos(vmulq_n_f32(vmulq_n_f32(d, pi), invLength));
    const float32x4_t weight = vmulq_n_f32(vmulq_f32(c, c), invLength);

    // vbslq_f32 picks its first value operand where the mask is set:
    // keep the weight inside the window and zero everything outside.
    return vbslq_f32(inside, weight, zeros);
}
122+
123+ __attribute__ ((always_inline))
124+ static inline float32x4_t CubicInterpolation(const float32x4_t d,
74125 const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3,
75126 const float C, const float B) {
76127
@@ -92,17 +143,8 @@ inline float32x4_t CubicInterpolation(const float32x4_t d,
92143 return result;
93144}
94145
95- inline float32x4_t HannWindow (const float32x4_t d, const float length) {
96- float32x4_t x = vabsq_f32 (d);
97- uint32x4_t mask = vcltq_f32 (x, vdupq_n_f32 (length / 2 ));
98-
99- x = Cos (vdivq_f32 (vmulq_f32 (vdupq_n_f32 (M_PI), x), vdupq_n_f32 (length)));
100- x = vmulq_n_f32 (vmulq_f32 (x, x), length / 2 );
101- x = vbslq_f32 (mask, vdupq_n_f32 (0 ), x);
102- return x;
103- }
104-
105- inline float32x4_t CatmullRom (const float32x4_t d,
146+ __attribute__ ((always_inline))
147+ static inline float32x4_t CatmullRom(const float32x4_t d,
106148 const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
107149
108150 float32x4_t x = vabsq_f32 (d);
@@ -119,7 +161,8 @@ inline float32x4_t CatmullRom(const float32x4_t d,
119161 return result;
120162}
121163
122- inline float32x4_t SimpleCubic (const float32x4_t d,
164+ __attribute__ ((always_inline))
165+ static inline float32x4_t SimpleCubic(const float32x4_t d,
123166 const float32x4_t p0, const float32x4_t p1, const float32x4_t p2, const float32x4_t p3) {
124167
125168 float32x4_t duplet = vmulq_f32 (d, d);
@@ -139,17 +182,20 @@ inline float32x4_t SimpleCubic(const float32x4_t d,
139182 return result;
140183}
141184
/**
 * Mitchell-Netravali filter: forwards to CubicInterpolation with both
 * trailing shape parameters (C and B) set to one third.
 */
__attribute__((always_inline))
static inline float32x4_t MitchellNetravali(float32x4_t d,
                                            float32x4_t p0, const float32x4_t p1,
                                            const float32x4_t p2, const float32x4_t p3) {
    const float oneThird = 1.0f / 3.0f;
    return CubicInterpolation(d, p0, p1, p2, p3, oneThird, oneThird);
}
146190
/**
 * Cubic Hermite filter: forwards to CubicInterpolation with both trailing
 * shape parameters (C and B) set to zero.
 */
__attribute__((always_inline))
static inline float32x4_t CubicHermite(const float32x4_t d,
                                       const float32x4_t p0, const float32x4_t p1,
                                       const float32x4_t p2, const float32x4_t p3) {
    const float paramC = 0.0f;
    const float paramB = 0.0f;
    return CubicInterpolation(d, p0, p1, p2, p3, paramC, paramB);
}
151196
/**
 * Cubic B-spline filter: forwards to CubicInterpolation with C = 0 and
 * B = 1.
 */
__attribute__((always_inline))
static inline float32x4_t CubicBSpline(const float32x4_t d,
                                       const float32x4_t p0, const float32x4_t p1,
                                       const float32x4_t p2, const float32x4_t p3) {
    const float paramC = 0.0f;
    const float paramB = 1.0f;
    return CubicInterpolation(d, p0, p1, p2, p3, paramC, paramB);
}
0 commit comments