Skip to content

Commit 29d906a

Browse files
committed
minor performance tweak
1 parent 4c4d4d8 commit 29d906a

File tree

1 file changed

+19
-4
lines changed

1 file changed

+19
-4
lines changed

src/filter_fma3.cpp

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,14 @@ std::vector<float>& apply_lanczos_filter_fma3(const float *samples, size_t sz, c
8585
if((index >= 3) && (index < avx_stop))
8686
output[i] = horizontal_sum(_mm256_mul_ps(_mm256_loadu_ps(&samples[index - 3]), _mm256_load_ps(&kernel.weights[j])));
8787
else
88-
output[i] = lanczos_convolve(samples, sz, kernel, index, j);
88+
{
89+
const auto start = index - 3;
90+
const auto stop = std::min(index + 5, (intmax_t)sz);
91+
auto sum = _mm_setzero_ps();
92+
for(auto k = std::max(start, (intmax_t)0); k < stop; ++k)
93+
sum = _mm_fmadd_ss(_mm_load_ss(&samples[k]), _mm_load_ss(&kernel.weights[j + (k - start)]), sum);
94+
output[i] = _mm_cvtss_f32(sum);
95+
}
8996
}
9097
return output;
9198
}
@@ -102,17 +109,25 @@ std::vector<float>& apply_lanczos_filter_fma3(const float *samples, size_t sz, c
102109
for(intmax_t i = 0, k = 0, l = 0; i < bands; ++i)
103110
{
104111
auto vecsum = _mm256_setzero_ps();
105-
auto fsum = 0.0f;
106112
auto count = (intmax_t)band_widths[i];
107113
for(intmax_t j = 0; j < count; ++j, ++k, l += step)
108114
{
109115
auto index = (intmax_t)x[k];
110116
if((index >= 3) && (index < avx_stop))
111117
vecsum = _mm256_fmadd_ps(_mm256_loadu_ps(&samples[index - 3]), _mm256_load_ps(&kernel.weights[l]), vecsum);
112118
else
113-
fsum += lanczos_convolve(samples, sz, kernel, index, l);
119+
{
120+
const auto start = index - 3;
121+
const auto stop = std::min(index + 5, (intmax_t)sz);
122+
123+
// this could be done better with asm, but this'll just have to do
124+
auto sum = _mm_setzero_ps();
125+
for(auto m = std::max(start, (intmax_t)0); m < stop; ++m)
126+
sum = _mm_fmadd_ss(_mm_load_ss(&samples[m]), _mm_load_ss(&kernel.weights[l + (m - start)]), sum);
127+
vecsum = _mm256_insertf128_ps(vecsum, _mm_add_ss(_mm256_castps256_ps128(vecsum), sum), 0);
128+
}
114129
}
115-
output[i] = (fsum + horizontal_sum(vecsum)) / count;
130+
output[i] = horizontal_sum(vecsum) / count;
116131
}
117132
return output;
118133
}

0 commit comments

Comments
 (0)