Add an audio function to retrieve the audio data captured since the previous call.

tamo · tamo · commit 419aee36e4a5 · 2025-01-02T00:18:09.000+09:00
Without it, `stream --save-audio` produces a somewhat choppy WAV file:
`stream` calculates t_diff in milliseconds
and combines audio pieces that are each about step_ms long.

A one-millisecond gap is only WHISPER_SAMPLE_RATE / 1000 == 16 samples,

but, surprisingly, human ears seem able to hear the gap
as noise.
diff --git a/examples/common-sdl.cpp b/examples/common-sdl.cpp
@@ -130,6 +130,7 @@ bool audio_async::clear() {
 
         m_audio_pos = 0;
         m_audio_len = 0;
+        m_audio_nxt = 0;
     }
 
     return true;
@@ -172,6 +173,28 @@ void audio_async::callback(uint8_t * stream, int len) {
 }
 
 void audio_async::get(int ms, std::vector<float> & result) {
+    if (ms <= 0) {
+        ms = m_len_ms;
+    }
+
+    size_t n_samples = std::min<size_t>(m_audio_len, (m_sample_rate * ms) / 1000);
+
+    get_n(n_samples, result);
+}
+
+void audio_async::next(std::vector<float> & result) {
+    size_t n_samples;
+
+    if (m_audio_pos >= m_audio_nxt) {
+        n_samples = m_audio_pos - m_audio_nxt;
+    } else {
+        n_samples = m_audio_len - m_audio_nxt + m_audio_pos;
+    }
+
+    get_n(n_samples, result);
+}
+
+void audio_async::get_n(size_t n_samples, std::vector<float> & result) {
     if (!m_dev_id_in) {
         fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
         return;
@@ -182,20 +205,9 @@ void audio_async::get(int ms, std::vector<float> & result) {
         return;
     }
 
-    result.clear();
-
     {
         std::lock_guard<std::mutex> lock(m_mutex);
 
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-
         result.resize(n_samples);
 
         int s0 = m_audio_pos - n_samples;
@@ -205,10 +217,12 @@ void audio_async::get(int ms, std::vector<float> & result) {
 
         if (s0 + n_samples > m_audio.size()) {
             const size_t n0 = m_audio.size() - s0;
+            m_audio_nxt = n_samples - n0;
 
             memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+            memcpy(&result[n0], &m_audio[0], m_audio_nxt * sizeof(float));
         } else {
+            m_audio_nxt = s0 + n_samples;
             memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
         }
     }
diff --git a/examples/common-sdl.h b/examples/common-sdl.h
@@ -30,6 +30,8 @@ class audio_async {
 
     // get audio data from the circular buffer
     void get(int ms, std::vector<float> & audio);
+    void next(std::vector<float> & audio);
+    void get_n(size_t n_samples, std::vector<float> & audio);
 
 private:
     SDL_AudioDeviceID m_dev_id_in = 0;
@@ -43,6 +45,7 @@ class audio_async {
     std::vector<float> m_audio;
     size_t             m_audio_pos = 0;
     size_t             m_audio_len = 0;
+    size_t             m_audio_nxt = 0;
 };
 
 // Return false if need to quit
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
 
         if (!use_vad) {
             while (true) {
-                audio.get(params.step_ms, pcmf32_new);
+                audio.next(pcmf32_new);
 
                 if ((int) pcmf32_new.size() > 2*n_samples_step) {
                     fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
@@ -250,7 +250,6 @@ int main(int argc, char ** argv) {
                 }
 
                 if ((int) pcmf32_new.size() >= n_samples_step) {
-                    audio.clear();
                     break;
                 }
 

Original file line number	Diff line number	Diff line change
`@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {`
`241`	`241`
`242`	`242`	`if (!use_vad) {`
`243`	`243`	`while (true) {`
`244`		`- audio.get(params.step_ms, pcmf32_new);`
	`244`	`+ audio.next(pcmf32_new);`
`245`	`245`
`246`	`246`	`if ((int) pcmf32_new.size() > 2*n_samples_step) {`
`247`	`247`	`fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);`
`@@ -250,7 +250,6 @@ int main(int argc, char ** argv) {`
`250`	`250`	`}`
`251`	`251`
`252`	`252`	`if ((int) pcmf32_new.size() >= n_samples_step) {`
`253`		`- audio.clear();`
`254`	`253`	`break;`
`255`	`254`	`}`
`256`	`255`