Skip to content

Commit f3b9340

Browse files
committed
Vectorise some loops
1 parent b388e9a commit f3b9340

File tree

1 file changed

+95
-91
lines changed

1 file changed

+95
-91
lines changed

src/pipeline/capture_pipeline.cpp

Lines changed: 95 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -193,27 +193,40 @@ void CapturePipeline::on_audio_chunk(const AudioChunkMeta& meta,
193193
}
194194

195195
// Update level metering (use boosted data)
196+
// Switch hoisted outside the loop so the inner loop is tight and vectorisable.
196197
int32_t peak = 0;
197198
double sum_sq = 0.0;
198-
for (size_t i = 0; i < sample_count; ++i) {
199-
int32_t s = 0;
200-
const uint8_t* p = out_data + i * bytes_per_sample;
201-
switch (meta.bits_per_sample) {
202-
case 16:
203-
s = static_cast<int16_t>(p[0] | (p[1] << 8));
204-
break;
205-
case 24:
206-
s = p[0] | (p[1] << 8) | (p[2] << 16);
207-
if (s & 0x800000) s |= 0xFF000000; // sign extend
208-
break;
209-
case 32:
210-
s = static_cast<int32_t>(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
211-
s >>= 16; // scale to 16-bit range for metering
212-
break;
199+
switch (meta.bits_per_sample) {
200+
case 16: {
201+
const int16_t* samples16 = reinterpret_cast<const int16_t*>(out_data);
202+
for (size_t i = 0; i < sample_count; ++i) {
203+
int32_t s = samples16[i];
204+
int32_t abs_s = std::abs(s);
205+
if (abs_s > peak) peak = abs_s;
206+
sum_sq += static_cast<double>(s) * s;
207+
}
208+
break;
209+
}
210+
case 24:
211+
for (size_t i = 0; i < sample_count; ++i) {
212+
const uint8_t* p = out_data + i * 3;
213+
int32_t s = p[0] | (p[1] << 8) | (p[2] << 16);
214+
if (s & 0x800000) s |= 0xFF000000;
215+
int32_t abs_s = std::abs(s);
216+
if (abs_s > peak) peak = abs_s;
217+
sum_sq += static_cast<double>(s) * s;
218+
}
219+
break;
220+
case 32: {
221+
const int32_t* samples32 = reinterpret_cast<const int32_t*>(out_data);
222+
for (size_t i = 0; i < sample_count; ++i) {
223+
int32_t s = samples32[i] >> 16;
224+
int32_t abs_s = std::abs(s);
225+
if (abs_s > peak) peak = abs_s;
226+
sum_sq += static_cast<double>(s) * s;
227+
}
228+
break;
213229
}
214-
int32_t abs_s = std::abs(s);
215-
if (abs_s > peak) peak = abs_s;
216-
sum_sq += static_cast<double>(s) * s;
217230
}
218231
peak_level_.store(static_cast<int16_t>(std::min(peak, (int32_t)32767)), std::memory_order_relaxed);
219232
rms_level_.store(std::sqrt(sum_sq / static_cast<double>(sample_count)),
@@ -224,41 +237,38 @@ void CapturePipeline::on_audio_chunk(const AudioChunkMeta& meta,
224237
void CapturePipeline::apply_gain(const uint8_t* src, uint8_t* dst,
225238
size_t sample_count, uint16_t bits_per_sample,
226239
double gain) const {
227-
size_t bps = bits_per_sample / 8;
228-
for (size_t i = 0; i < sample_count; ++i) {
229-
const uint8_t* sp = src + i * bps;
230-
uint8_t* dp = dst + i * bps;
231-
switch (bits_per_sample) {
232-
case 16: {
233-
int32_t s = static_cast<int16_t>(sp[0] | (sp[1] << 8));
234-
s = static_cast<int32_t>(s * gain);
235-
s = std::clamp(s, (int32_t)-32768, (int32_t)32767);
236-
dp[0] = static_cast<uint8_t>(s & 0xFF);
237-
dp[1] = static_cast<uint8_t>((s >> 8) & 0xFF);
238-
break;
240+
switch (bits_per_sample) {
241+
case 16: {
242+
const int16_t* sp = reinterpret_cast<const int16_t*>(src);
243+
int16_t* dp = reinterpret_cast<int16_t*>(dst);
244+
for (size_t i = 0; i < sample_count; ++i) {
245+
int32_t s = static_cast<int32_t>(sp[i] * gain);
246+
dp[i] = static_cast<int16_t>(std::clamp(s, (int32_t)-32768, (int32_t)32767));
239247
}
240-
case 24: {
241-
int32_t s = sp[0] | (sp[1] << 8) | (sp[2] << 16);
242-
if (s & 0x800000) s |= 0xFF000000;
243-
s = static_cast<int32_t>(s * gain);
244-
s = std::clamp(s, (int32_t)-8388608, (int32_t)8388607);
245-
dp[0] = static_cast<uint8_t>(s & 0xFF);
246-
dp[1] = static_cast<uint8_t>((s >> 8) & 0xFF);
247-
dp[2] = static_cast<uint8_t>((s >> 16) & 0xFF);
248-
break;
248+
break;
249+
}
250+
case 24:
251+
for (size_t i = 0; i < sample_count; ++i) {
252+
const uint8_t* s = src + i * 3;
253+
uint8_t* d = dst + i * 3;
254+
int32_t v = s[0] | (s[1] << 8) | (s[2] << 16);
255+
if (v & 0x800000) v |= 0xFF000000;
256+
v = static_cast<int32_t>(v * gain);
257+
v = std::clamp(v, (int32_t)-8388608, (int32_t)8388607);
258+
d[0] = static_cast<uint8_t>(v & 0xFF);
259+
d[1] = static_cast<uint8_t>((v >> 8) & 0xFF);
260+
d[2] = static_cast<uint8_t>((v >> 16) & 0xFF);
249261
}
250-
case 32: {
251-
int64_t s = static_cast<int32_t>(
252-
sp[0] | (sp[1] << 8) | (sp[2] << 16) | (sp[3] << 24));
253-
s = static_cast<int64_t>(s * gain);
254-
int32_t c = static_cast<int32_t>(std::clamp(
262+
break;
263+
case 32: {
264+
const int32_t* sp = reinterpret_cast<const int32_t*>(src);
265+
int32_t* dp = reinterpret_cast<int32_t*>(dst);
266+
for (size_t i = 0; i < sample_count; ++i) {
267+
int64_t s = static_cast<int64_t>(sp[i] * gain);
268+
dp[i] = static_cast<int32_t>(std::clamp(
255269
s, (int64_t)INT32_MIN, (int64_t)INT32_MAX));
256-
dp[0] = static_cast<uint8_t>(c & 0xFF);
257-
dp[1] = static_cast<uint8_t>((c >> 8) & 0xFF);
258-
dp[2] = static_cast<uint8_t>((c >> 16) & 0xFF);
259-
dp[3] = static_cast<uint8_t>((c >> 24) & 0xFF);
260-
break;
261270
}
271+
break;
262272
}
263273
}
264274
}
@@ -267,62 +277,56 @@ void CapturePipeline::apply_dc_remove(uint8_t* data, size_t sample_count,
267277
uint16_t channels,
268278
uint16_t bits_per_sample) {
269279
// Single-pole high-pass IIR: y[n] = x[n] - x[n-1] + alpha * y[n-1]
270-
size_t bps = bits_per_sample / 8;
271280
size_t frame_count = sample_count / channels;
272281

273-
for (size_t f = 0; f < frame_count; ++f) {
274-
for (uint16_t ch = 0; ch < channels; ++ch) {
275-
size_t idx = f * channels + ch;
276-
uint8_t* p = data + idx * bps;
277-
double x = 0.0;
278-
279-
switch (bits_per_sample) {
280-
case 16:
281-
x = static_cast<int16_t>(p[0] | (p[1] << 8));
282-
break;
283-
case 24: {
284-
int32_t s = p[0] | (p[1] << 8) | (p[2] << 16);
285-
if (s & 0x800000) s |= 0xFF000000;
286-
x = s;
287-
break;
282+
switch (bits_per_sample) {
283+
case 16: {
284+
int16_t* samples = reinterpret_cast<int16_t*>(data);
285+
for (size_t f = 0; f < frame_count; ++f) {
286+
for (uint16_t ch = 0; ch < channels; ++ch) {
287+
double x = samples[f * channels + ch];
288+
double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
289+
dc_prev_x_[ch] = x;
290+
dc_prev_y_[ch] = y;
291+
samples[f * channels + ch] = static_cast<int16_t>(
292+
std::clamp(static_cast<int32_t>(y), (int32_t)-32768, (int32_t)32767));
288293
}
289-
case 32:
290-
x = static_cast<int32_t>(
291-
p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
292-
break;
293294
}
294-
295-
double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
296-
dc_prev_x_[ch] = x;
297-
dc_prev_y_[ch] = y;
298-
299-
switch (bits_per_sample) {
300-
case 16: {
301-
int32_t out = std::clamp(static_cast<int32_t>(y),
302-
(int32_t)-32768, (int32_t)32767);
303-
p[0] = static_cast<uint8_t>(out & 0xFF);
304-
p[1] = static_cast<uint8_t>((out >> 8) & 0xFF);
305-
break;
306-
}
307-
case 24: {
295+
break;
296+
}
297+
case 24:
298+
for (size_t f = 0; f < frame_count; ++f) {
299+
for (uint16_t ch = 0; ch < channels; ++ch) {
300+
size_t idx = (f * channels + ch) * 3;
301+
uint8_t* p = data + idx;
302+
int32_t s = p[0] | (p[1] << 8) | (p[2] << 16);
303+
if (s & 0x800000) s |= 0xFF000000;
304+
double x = s;
305+
double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
306+
dc_prev_x_[ch] = x;
307+
dc_prev_y_[ch] = y;
308308
int32_t out = std::clamp(static_cast<int32_t>(y),
309309
(int32_t)-8388608, (int32_t)8388607);
310310
p[0] = static_cast<uint8_t>(out & 0xFF);
311311
p[1] = static_cast<uint8_t>((out >> 8) & 0xFF);
312312
p[2] = static_cast<uint8_t>((out >> 16) & 0xFF);
313-
break;
314313
}
315-
case 32: {
314+
}
315+
break;
316+
case 32: {
317+
int32_t* samples = reinterpret_cast<int32_t*>(data);
318+
for (size_t f = 0; f < frame_count; ++f) {
319+
for (uint16_t ch = 0; ch < channels; ++ch) {
320+
double x = samples[f * channels + ch];
321+
double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
322+
dc_prev_x_[ch] = x;
323+
dc_prev_y_[ch] = y;
316324
int64_t out64 = static_cast<int64_t>(y);
317-
int32_t out = static_cast<int32_t>(std::clamp(
325+
samples[f * channels + ch] = static_cast<int32_t>(std::clamp(
318326
out64, (int64_t)INT32_MIN, (int64_t)INT32_MAX));
319-
p[0] = static_cast<uint8_t>(out & 0xFF);
320-
p[1] = static_cast<uint8_t>((out >> 8) & 0xFF);
321-
p[2] = static_cast<uint8_t>((out >> 16) & 0xFF);
322-
p[3] = static_cast<uint8_t>((out >> 24) & 0xFF);
323-
break;
324327
}
325328
}
329+
break;
326330
}
327331
}
328332
}

0 commit comments

Comments
 (0)