Skip to content

Commit 449dbea

Browse files
committed
Mixer: Rework for performance, particularly of the S16 case
This removes downscaling (halving-add) when multiple voices are being mixed. To avoid clipping, and to get behavior similar to before, set the "level" of each voice to (1/voice_count). Slow paths that were applicable only to M0 chips were removed. As a side effect, the internal volume representation is now 0 .. 0x8000 (inclusive), which additionally makes a level of exactly 0.5 representable. Testing performed, on PyGamer: for all 4 data cases, for stereo and mono, for 1 and 2 voices, play pure sine waves represented as RawSamples and view the result on a scope and through headphones. Also, scope the amount of time spent in background tasks. Code size: growth of +272 bytes. Performance (time in background task when mixing 2 stereo 16-bit voices): 76us per run, down from 135us (once per ~2.9ms, long-term average), a decrease from 4.7% to 2.4% of all CPU time.
1 parent e6869c8 commit 449dbea

File tree

3 files changed

+109
-231
lines changed

3 files changed

+109
-231
lines changed

shared-module/audiomixer/Mixer.c

Lines changed: 105 additions & 227 deletions
Original file line numberDiff line numberDiff line change
@@ -101,213 +101,60 @@ void audiomixer_mixer_reset_buffer(audiomixer_mixer_obj_t* self,
101101
}
102102
}
103103

104-
uint32_t add8signed(uint32_t a, uint32_t b) {
105-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
106-
return __SHADD8(a, b);
107-
#else
108-
uint32_t result = 0;
109-
for (int8_t i = 0; i < 4; i++) {
110-
int8_t ai = a >> (sizeof(int8_t) * 8 * i);
111-
int8_t bi = b >> (sizeof(int8_t) * 8 * i);
112-
int32_t intermediate = (int32_t) ai + bi / 2;
113-
if (intermediate > CHAR_MAX) {
114-
intermediate = CHAR_MAX;
115-
} else if (intermediate < CHAR_MIN) {
116-
intermediate = CHAR_MIN;
117-
}
118-
result |= ((uint32_t) intermediate & 0xff) << (sizeof(int8_t) * 8 * i);
119-
}
120-
return result;
121-
#endif
104+
// Combine two words, each treated as a pair of packed 16-bit samples, with the
// __QADD16 intrinsic (presumably the CMSIS dual 16-bit saturating add, so each
// lane clamps instead of wrapping -- confirm against the CMSIS headers in use).
__attribute__((always_inline))
105+
static inline uint32_t add16signed(uint32_t a, uint32_t b) {
106+
return __QADD16(a, b);
122107
}
123108

124-
uint32_t add8unsigned(uint32_t a, uint32_t b) {
125-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
126-
return __UHADD8(a, b);
127-
#else
128-
uint32_t result = 0;
129-
for (int8_t i = 0; i < 4; i++) {
130-
uint8_t ai = (a >> (sizeof(uint8_t) * 8 * i));
131-
uint8_t bi = (b >> (sizeof(uint8_t) * 8 * i));
132-
int32_t intermediate = (int32_t) (ai + bi) / 2;
133-
if (intermediate > UCHAR_MAX) {
134-
intermediate = UCHAR_MAX;
135-
}
136-
result |= ((uint32_t) intermediate & 0xff) << (sizeof(uint8_t) * 8 * i);
137-
}
138-
return result;
139-
#endif
109+
// Scale a word holding two packed 16-bit samples by `mul` and return the
// repacked word. Each halfword of `val` is multiplied separately, shifted right
// by `shift`, saturated to `bits` bits, and the two results are recombined.
__attribute__((always_inline))
110+
static inline uint32_t mult16signed(uint32_t val, int32_t mul) {
111+
mul <<= 16; // multiplies below see mul * 65536 (presumably offsets the 16-bit discard built into smulwb/smulwt -- confirm)
112+
int32_t hi, lo;
113+
enum { bits = 16 }; // saturate to 16 bits
114+
enum { shift = 15 }; // right shift applied through the "asr" operand of the ssat instructions below
115+
asm volatile("smulwb %0, %1, %2" : "=r" (lo) : "r" (mul), "r" (val)); // mul times the bottom halfword of val
116+
asm volatile("smulwt %0, %1, %2" : "=r" (hi) : "r" (mul), "r" (val)); // mul times the top halfword of val
117+
asm volatile("ssat %0, %1, %2, asr %3" : "=r" (lo) : "I" (bits), "r" (lo), "I" (shift));
118+
asm volatile("ssat %0, %1, %2, asr %3" : "=r" (hi) : "I" (bits), "r" (hi), "I" (shift));
119+
asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (val) : "r" (lo), "r" (hi)); // pack: lo into the bottom halfword, hi shifted into the top
120+
return val;
140121
}
141122

142-
uint32_t add16signed(uint32_t a, uint32_t b) {
143-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
144-
return __SHADD16(a, b);
145-
#else
146-
uint32_t result = 0;
147-
for (int8_t i = 0; i < 2; i++) {
148-
int16_t ai = a >> (sizeof(int16_t) * 8 * i);
149-
int16_t bi = b >> (sizeof(int16_t) * 8 * i);
150-
int32_t intermediate = (int32_t) ai + bi / 2;
151-
if (intermediate > SHRT_MAX) {
152-
intermediate = SHRT_MAX;
153-
} else if (intermediate < SHRT_MIN) {
154-
intermediate = SHRT_MIN;
155-
}
156-
result |= (((uint32_t) intermediate) & 0xffff) << (sizeof(int16_t) * 8 * i);
157-
}
158-
return result;
159-
#endif
123+
// Offset each of the four packed 8-bit lanes of val by 0x80 using __UADD8
// (presumably a per-lane wrapping add, which maps signed 8-bit samples onto
// the unsigned range -- confirm against the CMSIS intrinsic documentation).
static inline uint32_t tounsigned8(uint32_t val) {
124+
return __UADD8(val, 0x80808080);
160125
}
161126

162-
uint32_t add16unsigned(uint32_t a, uint32_t b) {
163-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
164-
return __UHADD16(a, b);
165-
#else
166-
uint32_t result = 0;
167-
for (int8_t i = 0; i < 2; i++) {
168-
int16_t ai = (a >> (sizeof(uint16_t) * 8 * i)) - 0x8000;
169-
int16_t bi = (b >> (sizeof(uint16_t) * 8 * i)) - 0x8000;
170-
int32_t intermediate = (int32_t) ai + bi / 2;
171-
if (intermediate > USHRT_MAX) {
172-
intermediate = USHRT_MAX;
173-
}
174-
result |= ((uint32_t) intermediate & 0xffff) << (sizeof(int16_t) * 8 * i);
175-
}
176-
return result;
177-
#endif
127+
// Offset each of the two packed 16-bit lanes of val by 0x8000 using __UADD16
// (presumably a per-lane wrapping add, which maps signed 16-bit samples onto
// the unsigned range -- confirm against the CMSIS intrinsic documentation).
static inline uint32_t tounsigned16(uint32_t val) {
128+
return __UADD16(val, 0x80008000);
178129
}
179130

180-
static inline uint32_t mult8unsigned(uint32_t val, int32_t mul) {
181-
// if mul == 0, no need in wasting cycles
182-
if (mul == 0) {
183-
return 0;
184-
}
185-
/* TODO: workout ARMv7 instructions
186-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
187-
return val;
188-
#else*/
189-
uint32_t result = 0;
190-
float mod_mul = (float) mul / (float) ((1<<15)-1);
191-
for (int8_t i = 0; i < 4; i++) {
192-
uint8_t ai = val >> (sizeof(uint8_t) * 8 * i);
193-
int32_t intermediate = ai * mod_mul;
194-
if (intermediate > SHRT_MAX) {
195-
intermediate = SHRT_MAX;
196-
}
197-
result |= ((uint32_t) intermediate & 0xff) << (sizeof(uint8_t) * 8 * i);
198-
}
199-
200-
return result;
201-
//#endif
202-
}
203-
204-
static inline uint32_t mult8signed(uint32_t val, int32_t mul) {
205-
// if mul == 0, no need in wasting cycles
206-
if (mul == 0) {
207-
return 0;
208-
}
209-
/* TODO: workout ARMv7 instructions
210-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
211-
return val;
212-
#else
213-
*/
214-
uint32_t result = 0;
215-
float mod_mul = (float)mul / (float)((1<<15)-1);
216-
for (int8_t i = 0; i < 4; i++) {
217-
int16_t ai = val >> (sizeof(int8_t) * 8 * i);
218-
int32_t intermediate = ai * mod_mul;
219-
if (intermediate > CHAR_MAX) {
220-
intermediate = CHAR_MAX;
221-
} else if (intermediate < CHAR_MIN) {
222-
intermediate = CHAR_MIN;
223-
}
224-
result |= (((uint32_t) intermediate) & 0xff) << (sizeof(int16_t) * 8 * i);
225-
}
226-
return result;
227-
//#endif
131+
// Offset each of the two packed 16-bit lanes of val by 0x8000 using __UADD16.
// The body is the same expression as tounsigned16: assuming the add wraps per
// lane (confirm), adding 0x8000 only flips the top bit of each lane, so the
// same operation converts in both directions.
static inline uint32_t tosigned16(uint32_t val) {
132+
return __UADD16(val, 0x80008000);
228133
}
229134

230-
//TODO:
231-
static inline uint32_t mult16unsigned(uint32_t val, int32_t mul) {
232-
// if mul == 0, no need in wasting cycles
233-
if (mul == 0) {
234-
return 0;
235-
}
236-
/* TODO: the below ARMv7m instructions "work", but the amplitude is much higher/louder
237-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
238-
// there is no unsigned equivalent to the 'SMULWx' ARMv7 Thumb function,
239-
// so we have to do it by hand.
240-
uint32_t lo = val & 0xffff;
241-
uint32_t hi = val >> 16;
242-
//mp_printf(&mp_plat_print, "pre-asm: (mul: %d)\n\tval: %x\tlo: %x\thi: %x\n", mul, val, lo, hi);
243-
uint32_t val_lo;
244-
asm volatile("mul %0, %1, %2" : "=r" (val_lo) : "r" (mul), "r" (lo));
245-
asm volatile("mla %0, %1, %2, %3" : "=r" (val) : "r" (mul), "r" (hi), "r" (val_lo));
246-
//mp_printf(&mp_plat_print, "post-asm:\n\tval: %x\tlo: %x\n\n", val, val_lo);
247-
return val;
248-
#else
249-
*/
250-
uint32_t result = 0;
251-
float mod_mul = (float)mul / (float)((1<<15)-1);
252-
for (int8_t i = 0; i < 2; i++) {
253-
int16_t ai = (val >> (sizeof(uint16_t) * 8 * i)) - 0x8000;
254-
int32_t intermediate = ai * mod_mul;
255-
if (intermediate > SHRT_MAX) {
256-
intermediate = SHRT_MAX;
257-
} else if (intermediate < SHRT_MIN) {
258-
intermediate = SHRT_MIN;
259-
}
260-
result |= (((uint32_t) intermediate) + 0x8000) << (sizeof(int16_t) * 8 * i);
261-
}
262-
return result;
263-
//#endif
135+
// Spread the two bytes of a 16-bit value into a 32-bit word so that each byte
// becomes the upper byte of its own 16-bit lane: the high byte of val moves to
// bits 31..24, the low byte to bits 15..8, and the remaining bits are zero.
static inline uint32_t unpack8(uint16_t val) {
136+
return ((val & 0xff00) << 16) | ((val & 0x00ff) << 8);
264137
}
265138

266-
static inline uint32_t mult16signed(uint32_t val, int32_t mul) {
267-
// if mul == 0, no need in wasting cycles
268-
if (mul == 0) {
269-
return 0;
270-
}
271-
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
272-
int32_t hi, lo;
273-
enum { bits = 16 }; // saturate to 16 bits
274-
enum { shift = 0 }; // shift is done automatically
275-
asm volatile("smulwb %0, %1, %2" : "=r" (lo) : "r" (mul), "r" (val));
276-
asm volatile("smulwt %0, %1, %2" : "=r" (hi) : "r" (mul), "r" (val));
277-
asm volatile("ssat %0, %1, %2, asr %3" : "=r" (lo) : "I" (bits), "r" (lo), "I" (shift));
278-
asm volatile("ssat %0, %1, %2, asr %3" : "=r" (hi) : "I" (bits), "r" (hi), "I" (shift));
279-
asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (val) : "r" (lo), "r" (hi)); // pack
280-
return val;
281-
#else
282-
uint32_t result = 0;
283-
float mod_mul = (float)mul / (float)((1<<15)-1);
284-
for (int8_t i = 0; i < 2; i++) {
285-
int16_t ai = val >> (sizeof(int16_t) * 8 * i);
286-
int32_t intermediate = ai * mod_mul;
287-
if (intermediate > SHRT_MAX) {
288-
intermediate = SHRT_MAX;
289-
} else if (intermediate < SHRT_MIN) {
290-
intermediate = SHRT_MIN;
291-
}
292-
result |= (((uint32_t) intermediate) & 0xffff) << (sizeof(int16_t) * 8 * i);
293-
}
294-
return result;
295-
#endif
139+
// Collapse a 32-bit word back to two bytes by keeping only the upper byte of
// each 16-bit lane: bits 31..24 move to bits 15..8 and bits 15..8 move to
// bits 7..0. The lower byte of each lane is discarded.
static inline uint32_t pack8(uint32_t val) {
140+
return ((val & 0xff000000) >> 16) | ((val & 0xff00) >> 8);
296141
}
297142

143+
#define LIKELY(x) (__builtin_expect(!!(x), 1))
144+
#define UNLIKELY(x) (__builtin_expect(!!(x), 0))
298145
// Mix one voice into word_buffer, which holds `length` 32-bit words.
// NOTE: this span is a rendered diff. A line following an "NNN-" gutter belongs
// to the removed implementation; the comments added here describe the lines
// following "NNN+" gutters (and unchanged context lines).
// In the added code: when voices_active is false the level-scaled samples are
// stored into word_buffer, otherwise they are combined with the existing
// contents via add16signed. Input that is not samples_signed goes through
// tosigned16 first, and 8-bit data is widened with unpack8 and narrowed again
// with pack8, one halfword (two samples) at a time.
static void mix_one_voice(audiomixer_mixer_obj_t* self,
299146
audiomixer_mixervoice_obj_t* voice, bool voices_active,
300147
uint32_t* word_buffer, uint32_t length) {
301-
uint32_t j = 0;
302148
bool voice_done = voice->sample == NULL;
303-
for (uint32_t i = 0; i < length; i++) {
304-
if (!voice_done && j >= voice->buffer_length) {
149+
while (!voice_done && length != 0) {
150+
if (voice->buffer_length == 0) {
305151
if (!voice->more_data) {
306152
if (voice->loop) {
307153
audiosample_reset_buffer(voice->sample, false, 0);
308154
} else {
309155
voice->sample = NULL;
310156
voice_done = true;
157+
break;
311158
}
312159
}
313160
if (!voice_done) {
@@ -316,64 +163,81 @@ static void mix_one_voice(audiomixer_mixer_obj_t* self,
316163
// Track length in terms of words.
317164
voice->buffer_length /= sizeof(uint32_t);
318165
voice->more_data = result == GET_BUFFER_MORE_DATA;
319-
j = 0;
320166
}
321167
}
168+
169+
uint32_t n = MIN(voice->buffer_length, length);
170+
uint32_t *src = voice->remaining_buffer;
171+
uint16_t level = voice->level;
172+
322173
// First active voice overwrites the buffer with its level-scaled samples.
323-
uint32_t sample_value;
324-
if (voice_done) {
325-
// Exit early if another voice already set all samples once.
326-
if (voices_active) {
327-
continue;
328-
}
329-
sample_value = 0;
330-
if (!self->samples_signed) {
331-
if (self->bits_per_sample == 8) {
332-
sample_value = 0x7f7f7f7f;
174+
if (!voices_active) {
175+
if (LIKELY(self->bits_per_sample == 16)) {
176+
if (LIKELY(self->samples_signed)) {
177+
for (uint32_t i = 0; i<n; i++) {
178+
uint32_t v = src[i];
179+
word_buffer[i] = mult16signed(v, level);
180+
}
333181
} else {
334-
sample_value = 0x7fff7fff;
182+
for (uint32_t i = 0; i<n; i++) {
183+
uint32_t v = src[i];
184+
v = tosigned16(v);
185+
word_buffer[i] = mult16signed(v, level);
186+
}
335187
}
336-
}
337-
} else {
338-
sample_value = voice->remaining_buffer[j];
339-
}
340-
341-
// apply the mixer level
342-
if (!self->samples_signed) {
343-
if (self->bits_per_sample == 8) {
344-
sample_value = mult8unsigned(sample_value, voice->level);
345-
} else {
346-
sample_value = mult16unsigned(sample_value, voice->level);
347-
}
348-
} else {
349-
if (self->bits_per_sample == 8) {
350-
sample_value = mult8signed(sample_value, voice->level);
351188
} else {
352-
sample_value = mult16signed(sample_value, voice->level);
189+
uint16_t *hword_buffer = (uint16_t*)word_buffer;
190+
uint16_t *hsrc = (uint16_t*)src;
191+
for (uint32_t i = 0; i<n*2; i++) {
192+
uint32_t word = unpack8(hsrc[i]);
193+
if (LIKELY(!self->samples_signed)) {
194+
word = tosigned16(word);
195+
}
196+
word = mult16signed(word, level);
197+
hword_buffer[i] = pack8(word);
198+
}
353199
}
354-
}
355-
356-
if (!voices_active) {
357-
word_buffer[i] = sample_value;
358200
} else {
359-
if (self->bits_per_sample == 8) {
360-
if (self->samples_signed) {
361-
word_buffer[i] = add8signed(word_buffer[i], sample_value);
201+
if (LIKELY(self->bits_per_sample == 16)) {
202+
if (LIKELY(self->samples_signed)) {
203+
for (uint32_t i = 0; i<n; i++) {
204+
uint32_t word = src[i];
205+
word_buffer[i] = add16signed(mult16signed(word, level), word_buffer[i]);
206+
}
362207
} else {
363-
word_buffer[i] = add8unsigned(word_buffer[i], sample_value);
208+
for (uint32_t i = 0; i<n; i++) {
209+
uint32_t word = src[i];
210+
word = tosigned16(word);
211+
word_buffer[i] = add16signed(mult16signed(word, level), word_buffer[i]);
212+
}
364213
}
365214
} else {
366-
if (self->samples_signed) {
367-
word_buffer[i] = add16signed(word_buffer[i], sample_value);
368-
} else {
369-
word_buffer[i] = add16unsigned(word_buffer[i], sample_value);
215+
uint16_t *hword_buffer = (uint16_t*)word_buffer;
216+
uint16_t *hsrc = (uint16_t*)src;
217+
for (uint32_t i = 0; i<n*2; i++) {
218+
uint32_t word = unpack8(hsrc[i]);
219+
if (LIKELY(!self->samples_signed)) {
220+
word = tosigned16(word);
221+
}
222+
word = mult16signed(word, level);
223+
word = add16signed(word, unpack8(hword_buffer[i]));
224+
hword_buffer[i] = pack8(word);
370225
}
371226
}
372227
}
373-
j++;
228+
length -= n;
229+
word_buffer += n;
230+
voice->remaining_buffer += n;
231+
voice->buffer_length -= n;
232+
}
233+
234+
// Pad whatever is left of the buffer when this voice ran out of data and no
// earlier voice has written the buffer yet.
// NOTE(review): the loops above store samples after tosigned16, and the
// visible caller applies tounsigned16/tounsigned8 afterwards when
// !samples_signed, so the buffer looks signed at this point. If so, 0x8000 /
// 0x80 per lane is the most negative sample value rather than silence (0) --
// confirm the intended fill value.
if (length && !voices_active) {
235+
uint32_t sample_value = self->bits_per_sample == 8
236+
? 0x80808080 : 0x80008000;
237+
for (uint32_t i = 0; i<length; i++) {
238+
word_buffer[i] = sample_value;
239+
}
374240
}
375-
voice->buffer_length -= j;
376-
voice->remaining_buffer += j;
377241
}
378242

379243
audioio_get_buffer_result_t audiomixer_mixer_get_buffer(audiomixer_mixer_obj_t* self,
@@ -403,13 +267,27 @@ audioio_get_buffer_result_t audiomixer_mixer_get_buffer(audiomixer_mixer_obj_t*
403267
}
404268
self->use_first_buffer = !self->use_first_buffer;
405269
bool voices_active = false;
270+
uint32_t length = self->len / sizeof(uint32_t);
271+
406272
for (int32_t v = 0; v < self->voice_count; v++) {
407273
audiomixer_mixervoice_obj_t* voice = MP_OBJ_TO_PTR(self->voice[v]);
408274

409-
mix_one_voice(self, voice, voices_active, word_buffer, self->len / sizeof(uint32_t));
275+
mix_one_voice(self, voice, voices_active, word_buffer, length);
410276
voices_active = true;
411277
}
412278

279+
if (!self->samples_signed) {
280+
if (self->bits_per_sample == 16) {
281+
for (uint32_t i = 0; i < length; i++) {
282+
word_buffer[i] = tounsigned16(word_buffer[i]);
283+
}
284+
} else {
285+
for (uint32_t i = 0; i < length; i++) {
286+
word_buffer[i] = tounsigned8(word_buffer[i]);
287+
}
288+
}
289+
}
290+
413291
self->read_count += 1;
414292
} else if (!self->use_first_buffer) {
415293
*buffer = (uint8_t*) self->first_buffer;

0 commit comments

Comments
 (0)