@@ -193,27 +193,40 @@ void CapturePipeline::on_audio_chunk(const AudioChunkMeta& meta,
193193 }
194194
195195 // Update level metering (use boosted data)
196+ // Switch hoisted outside the loop so the inner loop is tight and vectorisable.
196197 int32_t peak = 0 ;
197198 double sum_sq = 0.0 ;
198- for (size_t i = 0 ; i < sample_count; ++i) {
199- int32_t s = 0 ;
200- const uint8_t * p = out_data + i * bytes_per_sample;
201- switch (meta.bits_per_sample ) {
202- case 16 :
203- s = static_cast <int16_t >(p[0 ] | (p[1 ] << 8 ));
204- break ;
205- case 24 :
206- s = p[0 ] | (p[1 ] << 8 ) | (p[2 ] << 16 );
207- if (s & 0x800000 ) s |= 0xFF000000 ; // sign extend
208- break ;
209- case 32 :
210- s = static_cast <int32_t >(p[0 ] | (p[1 ] << 8 ) | (p[2 ] << 16 ) | (p[3 ] << 24 ));
211- s >>= 16 ; // scale to 16-bit range for metering
212- break ;
199+ switch (meta.bits_per_sample ) {
200+ case 16 : {
201+ const int16_t * samples16 = reinterpret_cast <const int16_t *>(out_data);
202+ for (size_t i = 0 ; i < sample_count; ++i) {
203+ int32_t s = samples16[i];
204+ int32_t abs_s = std::abs (s);
205+ if (abs_s > peak) peak = abs_s;
206+ sum_sq += static_cast <double >(s) * s;
207+ }
208+ break ;
209+ }
210+ case 24 :
211+ for (size_t i = 0 ; i < sample_count; ++i) {
212+ const uint8_t * p = out_data + i * 3 ;
213+ int32_t s = p[0 ] | (p[1 ] << 8 ) | (p[2 ] << 16 );
214+ if (s & 0x800000 ) s |= 0xFF000000 ;
215+ int32_t abs_s = std::abs (s);
216+ if (abs_s > peak) peak = abs_s;
217+ sum_sq += static_cast <double >(s) * s;
218+ }
219+ break ;
220+ case 32 : {
221+ const int32_t * samples32 = reinterpret_cast <const int32_t *>(out_data);
222+ for (size_t i = 0 ; i < sample_count; ++i) {
223+ int32_t s = samples32[i] >> 16 ;
224+ int32_t abs_s = std::abs (s);
225+ if (abs_s > peak) peak = abs_s;
226+ sum_sq += static_cast <double >(s) * s;
227+ }
228+ break ;
213229 }
214- int32_t abs_s = std::abs (s);
215- if (abs_s > peak) peak = abs_s;
216- sum_sq += static_cast <double >(s) * s;
217230 }
218231 peak_level_.store (static_cast <int16_t >(std::min (peak, (int32_t )32767 )), std::memory_order_relaxed);
219232 rms_level_.store (std::sqrt (sum_sq / static_cast <double >(sample_count)),
@@ -224,41 +237,38 @@ void CapturePipeline::on_audio_chunk(const AudioChunkMeta& meta,
224237void CapturePipeline::apply_gain (const uint8_t * src, uint8_t * dst,
225238 size_t sample_count, uint16_t bits_per_sample,
226239 double gain) const {
227- size_t bps = bits_per_sample / 8 ;
228- for (size_t i = 0 ; i < sample_count; ++i) {
229- const uint8_t * sp = src + i * bps;
230- uint8_t * dp = dst + i * bps;
231- switch (bits_per_sample) {
232- case 16 : {
233- int32_t s = static_cast <int16_t >(sp[0 ] | (sp[1 ] << 8 ));
234- s = static_cast <int32_t >(s * gain);
235- s = std::clamp (s, (int32_t )-32768 , (int32_t )32767 );
236- dp[0 ] = static_cast <uint8_t >(s & 0xFF );
237- dp[1 ] = static_cast <uint8_t >((s >> 8 ) & 0xFF );
238- break ;
240+ switch (bits_per_sample) {
241+ case 16 : {
242+ const int16_t * sp = reinterpret_cast <const int16_t *>(src);
243+ int16_t * dp = reinterpret_cast <int16_t *>(dst);
244+ for (size_t i = 0 ; i < sample_count; ++i) {
245+ int32_t s = static_cast <int32_t >(sp[i] * gain);
246+ dp[i] = static_cast <int16_t >(std::clamp (s, (int32_t )-32768 , (int32_t )32767 ));
239247 }
240- case 24 : {
241- int32_t s = sp[0 ] | (sp[1 ] << 8 ) | (sp[2 ] << 16 );
242- if (s & 0x800000 ) s |= 0xFF000000 ;
243- s = static_cast <int32_t >(s * gain);
244- s = std::clamp (s, (int32_t )-8388608 , (int32_t )8388607 );
245- dp[0 ] = static_cast <uint8_t >(s & 0xFF );
246- dp[1 ] = static_cast <uint8_t >((s >> 8 ) & 0xFF );
247- dp[2 ] = static_cast <uint8_t >((s >> 16 ) & 0xFF );
248- break ;
248+ break ;
249+ }
250+ case 24 :
251+ for (size_t i = 0 ; i < sample_count; ++i) {
252+ const uint8_t * s = src + i * 3 ;
253+ uint8_t * d = dst + i * 3 ;
254+ int32_t v = s[0 ] | (s[1 ] << 8 ) | (s[2 ] << 16 );
255+ if (v & 0x800000 ) v |= 0xFF000000 ;
256+ v = static_cast <int32_t >(v * gain);
257+ v = std::clamp (v, (int32_t )-8388608 , (int32_t )8388607 );
258+ d[0 ] = static_cast <uint8_t >(v & 0xFF );
259+ d[1 ] = static_cast <uint8_t >((v >> 8 ) & 0xFF );
260+ d[2 ] = static_cast <uint8_t >((v >> 16 ) & 0xFF );
249261 }
250- case 32 : {
251- int64_t s = static_cast <int32_t >(
252- sp[0 ] | (sp[1 ] << 8 ) | (sp[2 ] << 16 ) | (sp[3 ] << 24 ));
253- s = static_cast <int64_t >(s * gain);
254- int32_t c = static_cast <int32_t >(std::clamp (
262+ break ;
263+ case 32 : {
264+ const int32_t * sp = reinterpret_cast <const int32_t *>(src);
265+ int32_t * dp = reinterpret_cast <int32_t *>(dst);
266+ for (size_t i = 0 ; i < sample_count; ++i) {
267+ int64_t s = static_cast <int64_t >(sp[i] * gain);
268+ dp[i] = static_cast <int32_t >(std::clamp (
255269 s, (int64_t )INT32_MIN, (int64_t )INT32_MAX));
256- dp[0 ] = static_cast <uint8_t >(c & 0xFF );
257- dp[1 ] = static_cast <uint8_t >((c >> 8 ) & 0xFF );
258- dp[2 ] = static_cast <uint8_t >((c >> 16 ) & 0xFF );
259- dp[3 ] = static_cast <uint8_t >((c >> 24 ) & 0xFF );
260- break ;
261270 }
271+ break ;
262272 }
263273 }
264274}
@@ -267,62 +277,56 @@ void CapturePipeline::apply_dc_remove(uint8_t* data, size_t sample_count,
267277 uint16_t channels,
268278 uint16_t bits_per_sample) {
269279 // Single-pole high-pass IIR: y[n] = x[n] - x[n-1] + alpha * y[n-1]
270- size_t bps = bits_per_sample / 8 ;
271280 size_t frame_count = sample_count / channels;
272281
273- for (size_t f = 0 ; f < frame_count; ++f) {
274- for (uint16_t ch = 0 ; ch < channels; ++ch) {
275- size_t idx = f * channels + ch;
276- uint8_t * p = data + idx * bps;
277- double x = 0.0 ;
278-
279- switch (bits_per_sample) {
280- case 16 :
281- x = static_cast <int16_t >(p[0 ] | (p[1 ] << 8 ));
282- break ;
283- case 24 : {
284- int32_t s = p[0 ] | (p[1 ] << 8 ) | (p[2 ] << 16 );
285- if (s & 0x800000 ) s |= 0xFF000000 ;
286- x = s;
287- break ;
282+ switch (bits_per_sample) {
283+ case 16 : {
284+ int16_t * samples = reinterpret_cast <int16_t *>(data);
285+ for (size_t f = 0 ; f < frame_count; ++f) {
286+ for (uint16_t ch = 0 ; ch < channels; ++ch) {
287+ double x = samples[f * channels + ch];
288+ double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
289+ dc_prev_x_[ch] = x;
290+ dc_prev_y_[ch] = y;
291+ samples[f * channels + ch] = static_cast <int16_t >(
292+ std::clamp (static_cast <int32_t >(y), (int32_t )-32768 , (int32_t )32767 ));
288293 }
289- case 32 :
290- x = static_cast <int32_t >(
291- p[0 ] | (p[1 ] << 8 ) | (p[2 ] << 16 ) | (p[3 ] << 24 ));
292- break ;
293294 }
294-
295- double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
296- dc_prev_x_[ch] = x;
297- dc_prev_y_[ch] = y;
298-
299- switch (bits_per_sample) {
300- case 16 : {
301- int32_t out = std::clamp (static_cast <int32_t >(y),
302- (int32_t )-32768 , (int32_t )32767 );
303- p[0 ] = static_cast <uint8_t >(out & 0xFF );
304- p[1 ] = static_cast <uint8_t >((out >> 8 ) & 0xFF );
305- break ;
306- }
307- case 24 : {
295+ break ;
296+ }
297+ case 24 :
298+ for (size_t f = 0 ; f < frame_count; ++f) {
299+ for (uint16_t ch = 0 ; ch < channels; ++ch) {
300+ size_t idx = (f * channels + ch) * 3 ;
301+ uint8_t * p = data + idx;
302+ int32_t s = p[0 ] | (p[1 ] << 8 ) | (p[2 ] << 16 );
303+ if (s & 0x800000 ) s |= 0xFF000000 ;
304+ double x = s;
305+ double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
306+ dc_prev_x_[ch] = x;
307+ dc_prev_y_[ch] = y;
308308 int32_t out = std::clamp (static_cast <int32_t >(y),
309309 (int32_t )-8388608 , (int32_t )8388607 );
310310 p[0 ] = static_cast <uint8_t >(out & 0xFF );
311311 p[1 ] = static_cast <uint8_t >((out >> 8 ) & 0xFF );
312312 p[2 ] = static_cast <uint8_t >((out >> 16 ) & 0xFF );
313- break ;
314313 }
315- case 32 : {
314+ }
315+ break ;
316+ case 32 : {
317+ int32_t * samples = reinterpret_cast <int32_t *>(data);
318+ for (size_t f = 0 ; f < frame_count; ++f) {
319+ for (uint16_t ch = 0 ; ch < channels; ++ch) {
320+ double x = samples[f * channels + ch];
321+ double y = x - dc_prev_x_[ch] + DC_ALPHA * dc_prev_y_[ch];
322+ dc_prev_x_[ch] = x;
323+ dc_prev_y_[ch] = y;
316324 int64_t out64 = static_cast <int64_t >(y);
317- int32_t out = static_cast <int32_t >(std::clamp (
325+ samples[f * channels + ch] = static_cast <int32_t >(std::clamp (
318326 out64, (int64_t )INT32_MIN, (int64_t )INT32_MAX));
319- p[0 ] = static_cast <uint8_t >(out & 0xFF );
320- p[1 ] = static_cast <uint8_t >((out >> 8 ) & 0xFF );
321- p[2 ] = static_cast <uint8_t >((out >> 16 ) & 0xFF );
322- p[3 ] = static_cast <uint8_t >((out >> 24 ) & 0xFF );
323- break ;
324327 }
325328 }
329+ break ;
326330 }
327331 }
328332}
0 commit comments