@@ -85,7 +85,14 @@ std::vector<float>& apply_lanczos_filter_fma3(const float *samples, size_t sz, c
8585 if ((index >= 3 ) && (index < avx_stop))
8686 output[i] = horizontal_sum (_mm256_mul_ps (_mm256_loadu_ps (&samples[index - 3 ]), _mm256_load_ps (&kernel.weights [j])));
8787 else
88- output[i] = lanczos_convolve (samples, sz, kernel, index, j);
88+ {
89+ const auto start = index - 3 ;
90+ const auto stop = std::min (index + 5 , (intmax_t )sz);
91+ auto sum = _mm_setzero_ps ();
92+ for (auto k = std::max (start, (intmax_t )0 ); k < stop; ++k)
93+ sum = _mm_fmadd_ss (_mm_load_ss (&samples[k]), _mm_load_ss (&kernel.weights [j + (k - start)]), sum);
94+ output[i] = _mm_cvtss_f32 (sum);
95+ }
8996 }
9097 return output;
9198}
@@ -102,17 +109,25 @@ std::vector<float>& apply_lanczos_filter_fma3(const float *samples, size_t sz, c
102109 for (intmax_t i = 0 , k = 0 , l = 0 ; i < bands; ++i)
103110 {
104111 auto vecsum = _mm256_setzero_ps ();
105- auto fsum = 0 .0f ;
106112 auto count = (intmax_t )band_widths[i];
107113 for (intmax_t j = 0 ; j < count; ++j, ++k, l += step)
108114 {
109115 auto index = (intmax_t )x[k];
110116 if ((index >= 3 ) && (index < avx_stop))
111117 vecsum = _mm256_fmadd_ps (_mm256_loadu_ps (&samples[index - 3 ]), _mm256_load_ps (&kernel.weights [l]), vecsum);
112118 else
113- fsum += lanczos_convolve (samples, sz, kernel, index, l);
119+ {
120+ const auto start = index - 3 ;
121+ const auto stop = std::min (index + 5 , (intmax_t )sz);
122+
123+ // this could be done better with asm, but this'll just have to do
124+ auto sum = _mm_setzero_ps ();
125+ for (auto m = std::max (start, (intmax_t )0 ); m < stop; ++m)
126+ sum = _mm_fmadd_ss (_mm_load_ss (&samples[m]), _mm_load_ss (&kernel.weights [l + (m - start)]), sum);
127+ vecsum = _mm256_insertf128_ps (vecsum, _mm_add_ss (_mm256_castps256_ps128 (vecsum), sum), 0 );
128+ }
114129 }
115- output[i] = (fsum + horizontal_sum (vecsum) ) / count;
130+ output[i] = horizontal_sum (vecsum) / count;
116131 }
117132 return output;
118133}
0 commit comments