From 48efa9646a6ca477ba69218c7440c0d6257c5ba7 Mon Sep 17 00:00:00 2001
From: Daniel Ritchie <daniel@brainwavecollective.ai>
Date: Mon, 3 Nov 2025 19:18:53 -0700
Subject: [PATCH 1/5] Add Windows support and improve realtime streaming

---
 my_local_files.txt                         |   3 +
 src/reachy_mini/media/audio_sounddevice.py | 210 ++++++++---
 tests/test_audio_methods.py                | 397 +++++++++++++++++++++
 3 files changed, 565 insertions(+), 45 deletions(-)
 create mode 100644 my_local_files.txt
 create mode 100644 tests/test_audio_methods.py

diff --git a/my_local_files.txt b/my_local_files.txt
new file mode 100644
index 00000000..2e9418aa
--- /dev/null
+++ b/my_local_files.txt
@@ -0,0 +1,3 @@
+
+src/reachy_mini/media/audio_sounddevice.py
+tests/test_audio_methods.py
diff --git a/src/reachy_mini/media/audio_sounddevice.py b/src/reachy_mini/media/audio_sounddevice.py
index 6ac053d3..81593621 100644
--- a/src/reachy_mini/media/audio_sounddevice.py
+++ b/src/reachy_mini/media/audio_sounddevice.py
@@ -3,6 +3,8 @@
 import os
 import threading
 import time
+from collections import deque
+from queue import Queue, Empty
 from typing import Any, List, Optional
 
 import numpy as np
@@ -13,26 +15,35 @@
 
 from reachy_mini.utils.constants import ASSETS_ROOT_PATH
 
-from .audio_base import AudioBase
+from .audio_base import AudioBackend, AudioBase
 
 
 class SoundDeviceAudio(AudioBase):
     """Audio device implementation using sounddevice."""
-
+    
     def __init__(
         self,
-        frames_per_buffer: int = 256,
+        frames_per_buffer: int = 1024,
         log_level: str = "INFO",
     ) -> None:
-        """Initialize the SoundDevice audio device."""
-        super().__init__(log_level=log_level)
+        super().__init__(backend=AudioBackend.SOUNDDEVICE, log_level=log_level)
         self.frames_per_buffer = frames_per_buffer
         self.stream = None
         self._output_stream = None
         self._buffer: List[npt.NDArray[np.float32]] = []
+
+        # Device ids
         self._output_device_id = self.get_output_device_id("respeaker")
         self._input_device_id = self.get_input_device_id("respeaker")
 
+        self._streaming_active = False
+        self._chunk_fifo: deque[npt.NDArray[np.float32]] = deque()
+        self._queued_samples: int = 0
+        self._tail: Optional[npt.NDArray[np.float32]] = None
+        self._target_buffer_ms: int = 120  # tune 80–200 for stability/latency tradeoff
+        self._underflows: int = 0
+        self._overflows: int = 0
+
     def start_recording(self) -> None:
         """Open the audio input stream, using ReSpeaker card if available."""
         self.stream = sd.InputStream(
@@ -40,6 +51,8 @@ def start_recording(self) -> None:
             device=self._input_device_id,
             callback=self._callback,
             samplerate=self.SAMPLE_RATE,
+            channels=1,
+            dtype="float32",
         )
         if self.stream is None:
             raise RuntimeError("Failed to open SoundDevice audio stream.")
@@ -77,32 +90,129 @@ def stop_recording(self) -> None:
             self.logger.info("SoundDevice audio stream closed.")
 
     def push_audio_sample(self, data: npt.NDArray[np.float32]) -> None:
-        """Push audio data to the output device."""
-        if self._output_stream is not None:
-            self._output_stream.write(data)
-        else:
-            self.logger.warning(
-                "Output stream is not open. Call start_playing() first."
-            )
-
+        """Push PCM mono float32 audio into the output FIFO."""
+        if not self._streaming_active or self._output_stream is None:
+            self.logger.warning("Output stream is not active. Call start_playing() first.")
+            return
+
+        # Ensure shape (n,) float32 mono
+        if data.ndim == 2:
+            if data.shape[1] > 1:
+                data = np.mean(data, axis=1)
+            else:
+                data = data[:, 0]
+        data = np.asarray(data, dtype=np.float32, order="C")
+
+        # Push into local FIFO; keep sample count
+        self._chunk_fifo.append(data)
+        self._queued_samples += data.shape[0]
+
+    # --- helper: compute target watermark in samples ---
+    def _target_buffer_samples(self) -> int:
+        return int(self.SAMPLE_RATE * (self._target_buffer_ms / 1000.0))
+        
+        # A modest prebuffer (start at 120 ms) smooths bursty input. Tune 80–200 ms depending on your device/OS. Lower = snappier, higher = safer.
+       
+        # Try self._target_buffer_ms = 150 and/or frames_per_buffer = 512 (lower latency) or 2048 (more stable).
+        
+        # FYI: On Windows with ReSpeaker, WASAPI shared mode can be finicky. Try larger frames_per_buffer (e.g., 2048) first; if still choppy, bump _target_buffer_ms to ~200 ms.
+        
+    # --- improved callback: drain multiple chunks + carry-over ---
+    def _streaming_callback(
+        self,
+        outdata: npt.NDArray[np.float32],
+        frames: int,
+        time: Any,
+        status: sd.CallbackFlags,
+    ) -> None:
+        # Handle PortAudio status flags
+        if status:
+            # Track under/over flows for debugging
+            if status.input_overflow or status.output_underflow:
+                if status.output_underflow:
+                    self._underflows += 1
+                if status.input_overflow:
+                    self._overflows += 1
+                self.logger.debug(f"Audio status: {status} (uf={self._underflows}, of={self._overflows})")
+
+        # Ensure we don't leave stale data in other channels
+        # outdata shape: (frames, channels)
+        out = outdata[:, 0]
+        out[:] = 0.0  # default to silence; we’ll fill what we can
+
+        # Keep a small prebuffer to avoid stutter right after starts or small gaps
+        target = self._target_buffer_samples()
+
+        # ---- read from tail first
+        written = 0
+        if self._tail is not None and self._tail.size:
+            take = min(frames, self._tail.size)
+            out[:take] = self._tail[:take]
+            written += take
+            if take < self._tail.size:
+                self._tail = self._tail[take:]
+            else:
+                self._tail = None
+
+        # ---- then drain FIFO until we fill this block
+        while written < frames:
+            try:
+                if self._queued_samples < target:
+                    # Not enough buffered audio—leave remaining zeros (silence)
+                    break
+
+                # Pull next chunk from local FIFO (not the Queue; producer pushes here)
+                chunk = self._chunk_fifo.popleft()
+                self._queued_samples -= chunk.shape[0]
+
+                need = frames - written
+                if chunk.shape[0] <= need:
+                    out[written:written + chunk.shape[0]] = chunk
+                    written += chunk.shape[0]
+                else:
+                    # Partially consume; keep remainder as tail
+                    out[written:frames] = chunk[:need]
+                    self._tail = chunk[need:]  # carry over to next callback
+                    written = frames
+
+            except IndexError:
+                # FIFO empty
+                break
+
+    # --- start_playing: reset tracking, explicit dtype ---
     def start_playing(self) -> None:
-        """Open the audio output stream."""
+        self._streaming_active = True
+        self._chunk_fifo.clear()
+        self._queued_samples = 0
+        self._tail = None
+        self._underflows = 0
+        self._overflows = 0
+
         self._output_stream = sd.OutputStream(
             samplerate=self.SAMPLE_RATE,
             device=self._output_device_id,
             channels=1,
+            dtype="float32",
+            callback=self._streaming_callback,
+            blocksize=self.frames_per_buffer,
         )
         if self._output_stream is None:
             raise RuntimeError("Failed to open SoundDevice audio output stream.")
         self._output_stream.start()
+        self.logger.info("SoundDevice audio output stream opened (callback mode w/ smoothing).")
 
     def stop_playing(self) -> None:
-        """Close the audio output stream."""
+        self._streaming_active = False
         if self._output_stream is not None:
-            self._output_stream.stop()
-            self._output_stream.close()
+            try:
+                self._output_stream.stop()
+            finally:
+                self._output_stream.close()
             self._output_stream = None
-            self.logger.info("SoundDevice audio output stream closed.")
+        self._chunk_fifo.clear()
+        self._queued_samples = 0
+        self._tail = None
+        self.logger.info("SoundDevice audio output stream closed.")
 
     def play_sound(self, sound_file: str, autoclean: bool = False) -> None:
         """Play a sound file from the assets directory or a given path using sounddevice and soundfile."""
@@ -181,46 +291,56 @@ def _clean_up_thread() -> None:
                 daemon=True,
             ).start()
 
-    def get_output_device_id(self, name_contains: str) -> int:
-        """Return the output device id whose name contains the given string (case-insensitive).
+    def _find_device_id(
+        self, name_contains: str, device_type: str
+    ) -> int:
+        """Find device ID by name and type with fallback logic.
+
+        Args:
+            name_contains: Substring to search for in device name (case-insensitive)
+            device_type: Either "input" or "output"
 
-        If not found, return the default output device id.
+        Returns:
+            Device index
+
+        Raises:
+            RuntimeError: If no device with appropriate channels found
         """
         devices = sd.query_devices()
+        channel_key = f"max_{device_type}_channels"
 
+        # First try: Search for device by specific name (e.g., "respeaker")
         for idx, dev in enumerate(devices):
             if (
                 name_contains.lower() in dev["name"].lower()
-                and dev["max_output_channels"] > 0
+                and dev.get(channel_key, 0) > 0
             ):
                 return idx
-        # Return default output device if not found
+
+        # Log warning if device with specific name not found
         self.logger.warning(
-            f"No output device found containing '{name_contains}', using default."
+            f"No {device_type} device containing '{name_contains}' found. Using first available {device_type} device."
         )
-        return self._safe_query_device("output")
-
-    def get_input_device_id(self, name_contains: str) -> int:
-        """Return the input device id whose name contains the given string (case-insensitive).
-
-        If not found, return the default input device id.
-        """
-        devices = sd.query_devices()
 
+        # Fallback: Return first device with appropriate channels
         for idx, dev in enumerate(devices):
-            if (
-                name_contains.lower() in dev["name"].lower()
-                and dev["max_input_channels"] > 0
-            ):
+            if dev.get(channel_key, 0) > 0:
                 return idx
-        # Return default input device if not found
-        self.logger.warning(
-            f"No input device found containing '{name_contains}', using default."
+
+        raise RuntimeError(
+            f"No {device_type} audio device with {device_type} channels found."
         )
-        return self._safe_query_device("input")
 
-    def _safe_query_device(self, kind: str) -> int:
-        try:
-            return int(sd.query_devices(None, kind)["index"])
-        except sd.PortAudioError:
-            return int(sd.default.device[1])
+    def get_output_device_id(self, name_contains: str) -> int:
+        """Return the output device id whose name contains the given string (case-insensitive).
+
+        If not found, return the first available output device.
+        """
+        return self._find_device_id(name_contains, "output")
+
+    def get_input_device_id(self, name_contains: str) -> int:
+        """Return the input device id whose name contains the given string (case-insensitive).
+
+        If not found, return the first available input device.
+        """
+        return self._find_device_id(name_contains, "input")
diff --git a/tests/test_audio_methods.py b/tests/test_audio_methods.py
new file mode 100644
index 00000000..c81a3082
--- /dev/null
+++ b/tests/test_audio_methods.py
@@ -0,0 +1,397 @@
+"""Comprehensive audio streaming test comparing write-based vs callback-based methods.
+
+Tests both methods across multiple metrics:
+- Basic functionality (does it work?)
+- Latency (how delayed is the audio?)
+- Quality (any clicks, pops, or gaps?)
+- Stability (does it maintain consistent playback?)
+- CPU usage (resource efficiency)
+"""
+
+import time
+import numpy as np
+import sounddevice as sd
+from queue import Queue, Empty
+from typing import Tuple, Dict, Any
+import threading
+
+
+class AudioTester:
+    """Test audio playback methods."""
+    
+    def __init__(self):
+        """Initialize with reSpeaker device."""
+        self.sample_rate = 16000
+        self.device_id = self._find_respeaker()
+        print(f"Using device ID: {self.device_id}")
+        print(f"Device info: {sd.query_devices(self.device_id)}")
+        
+    def _find_respeaker(self) -> int:
+        """Find reSpeaker output device."""
+        devices = sd.query_devices()
+        for idx, dev in enumerate(devices):
+            if "respeaker" in dev["name"].lower() and dev.get("max_output_channels", 0) > 0:
+                return idx
+        raise RuntimeError("reSpeaker output device not found")
+    
+    def generate_test_tone(self, duration: float = 1.0, frequency: float = 440.0) -> np.ndarray:
+        """Generate a test tone."""
+        t = np.linspace(0, duration, int(self.sample_rate * duration))
+        tone = (np.sin(2 * np.pi * frequency * t) * 0.3).astype(np.float32)
+        return tone
+    
+    def generate_test_sequence(self) -> np.ndarray:
+        """Generate sequence of different tones to test timing."""
+        # Three beeps with silence between them
+        beep1 = self.generate_test_tone(0.2, 440)  # A
+        silence = np.zeros(int(self.sample_rate * 0.1), dtype=np.float32)
+        beep2 = self.generate_test_tone(0.2, 554)  # C#
+        beep3 = self.generate_test_tone(0.2, 659)  # E
+        
+        sequence = np.concatenate([beep1, silence, beep2, silence, beep3])
+        return sequence
+    
+    # ==================== METHOD 1: WRITE-BASED (CURRENT/BROKEN) ====================
+    
+    def test_write_based(self) -> Dict[str, Any]:
+        """Test the current write-based method."""
+        print("\n" + "="*70)
+        print("TEST 1: WRITE-BASED STREAMING (Current Method)")
+        print("="*70)
+        
+        results = {
+            "method": "write-based",
+            "works": False,
+            "latency": None,
+            "errors": [],
+            "notes": []
+        }
+        
+        try:
+            # Create output stream
+            stream = sd.OutputStream(
+                samplerate=self.sample_rate,
+                device=self.device_id,
+                channels=1,
+            )
+            stream.start()
+            
+            print("Stream started...")
+            time.sleep(0.5)
+            
+            # Generate and play test sequence
+            test_audio = self.generate_test_sequence()
+            
+            print(f"Writing {len(test_audio)} samples...")
+            start_time = time.time()
+            
+            # Try to write audio
+            try:
+                stream.write(test_audio)
+                write_time = time.time() - start_time
+                results["latency"] = write_time
+                results["works"] = True
+                print(f"✓ Write completed in {write_time:.3f}s")
+            except Exception as e:
+                results["errors"].append(f"Write failed: {e}")
+                print(f"✗ Write failed: {e}")
+            
+            time.sleep(2)  # Wait to hear if anything plays
+            
+            stream.stop()
+            stream.close()
+            print("Stream closed.")
+            
+        except Exception as e:
+            results["errors"].append(f"Stream creation failed: {e}")
+            print(f"✗ Stream creation failed: {e}")
+        
+        return results
+    
+    # ==================== METHOD 2: CALLBACK-BASED (PROPOSED FIX) ====================
+    
+    def test_callback_based(self) -> Dict[str, Any]:
+        """Test the proposed callback-based method."""
+        print("\n" + "="*70)
+        print("TEST 2: CALLBACK-BASED STREAMING (Proposed Fix)")
+        print("="*70)
+        
+        results = {
+            "method": "callback-based",
+            "works": False,
+            "latency": None,
+            "errors": [],
+            "notes": []
+        }
+        
+        output_queue = Queue()
+        playback_started = threading.Event()
+        first_callback_time = [None]
+        
+        def callback(outdata, frames, time_info, status):
+            """Streaming callback."""
+            if status:
+                results["notes"].append(f"Status: {status}")
+            
+            # Mark when first callback happens
+            if first_callback_time[0] is None:
+                first_callback_time[0] = time.time()
+                playback_started.set()
+            
+            try:
+                data = output_queue.get_nowait()
+                if len(data) >= frames:
+                    outdata[:, 0] = data[:frames]
+                    if len(data) > frames:
+                        output_queue.put(data[frames:])
+                else:
+                    outdata[:len(data), 0] = data
+                    outdata[len(data):, 0] = 0
+            except Empty:
+                outdata[:, 0] = 0  # Output silence if no data
+        
+        try:
+            # Create output stream with callback
+            stream = sd.OutputStream(
+                samplerate=self.sample_rate,
+                device=self.device_id,
+                channels=1,
+                callback=callback,
+                blocksize=1024,
+            )
+            stream.start()
+            
+            print("Stream started with callback...")
+            time.sleep(0.5)
+            
+            # Generate and queue test sequence
+            test_audio = self.generate_test_sequence()
+            
+            print(f"Queueing {len(test_audio)} samples...")
+            queue_start = time.time()
+            output_queue.put(test_audio)
+            
+            # Wait for playback to start
+            if playback_started.wait(timeout=2.0):
+                latency = first_callback_time[0] - queue_start
+                results["latency"] = latency
+                results["works"] = True
+                print(f"✓ Playback started with {latency:.3f}s latency")
+            else:
+                results["errors"].append("Playback did not start")
+                print("✗ Playback did not start within 2 seconds")
+            
+            time.sleep(2)  # Wait to hear full sequence
+            
+            stream.stop()
+            stream.close()
+            print("Stream closed.")
+            
+        except Exception as e:
+            results["errors"].append(f"Stream creation failed: {e}")
+            print(f"✗ Stream creation failed: {e}")
+        
+        return results
+    
+    # ==================== METHOD 3: CALLBACK-BASED (LIKE PLAY_SOUND) ====================
+    
+    def test_callback_file_style(self) -> Dict[str, Any]:
+        """Test callback-based method using the pattern from play_sound()."""
+        print("\n" + "="*70)
+        print("TEST 3: CALLBACK-BASED (play_sound style)")
+        print("="*70)
+        
+        results = {
+            "method": "callback-file-style",
+            "works": False,
+            "latency": None,
+            "errors": [],
+            "notes": []
+        }
+        
+        test_audio = self.generate_test_sequence()
+        start_pos = [0]
+        length = len(test_audio)
+        playback_started = threading.Event()
+        first_callback_time = [None]
+        
+        def callback(outdata, frames, time_info, status):
+            """File-style playback callback."""
+            if status:
+                results["notes"].append(f"Status: {status}")
+            
+            if first_callback_time[0] is None:
+                first_callback_time[0] = time.time()
+                playback_started.set()
+            
+            end = start_pos[0] + frames
+            if end > length:
+                # Fill remaining with audio data and pad with zeros
+                outdata[: length - start_pos[0], 0] = test_audio[start_pos[0] :]
+                outdata[length - start_pos[0] :, 0] = 0
+                raise sd.CallbackStop()
+            else:
+                outdata[:, 0] = test_audio[start_pos[0] : end]
+            start_pos[0] = end
+        
+        try:
+            event = threading.Event()
+            
+            stream_start = time.time()
+            stream = sd.OutputStream(
+                samplerate=self.sample_rate,
+                device=self.device_id,
+                channels=1,
+                callback=callback,
+                finished_callback=event.set,
+            )
+            stream.start()
+            
+            print("Stream started with file-style callback...")
+            
+            # Wait for playback to start
+            if playback_started.wait(timeout=2.0):
+                latency = first_callback_time[0] - stream_start
+                results["latency"] = latency
+                results["works"] = True
+                print(f"✓ Playback started with {latency:.3f}s latency")
+            else:
+                results["errors"].append("Playback did not start")
+                print("✗ Playback did not start")
+            
+            # Wait for completion
+            event.wait(timeout=5.0)
+            time.sleep(0.5)
+            
+            stream.stop()
+            stream.close()
+            print("Stream closed.")
+            
+        except Exception as e:
+            results["errors"].append(f"Failed: {e}")
+            print(f"✗ Failed: {e}")
+        
+        return results
+    
+    # ==================== COMPREHENSIVE TEST RUNNER ====================
+    
+    def run_all_tests(self) -> Dict[str, Any]:
+        """Run all tests and compare results."""
+        print("\n" + "="*70)
+        print("AUDIO STREAMING METHOD COMPARISON TEST")
+        print("="*70)
+        print(f"Sample Rate: {self.sample_rate} Hz")
+        print(f"Device: {sd.query_devices(self.device_id)['name']}")
+        print("\nFor each test, you should listen for THREE BEEPS.")
+        print("After each test, you'll be asked if you heard audio.")
+        print("="*70)
+        
+        results = {}
+        
+        # Test 1: Write-based (current broken method)
+        print("\n\n[TEST 1 of 3] WRITE-BASED METHOD (current implementation)")
+        input("Press ENTER to play test audio...")
+        results["write_based"] = self.test_write_based()
+        time.sleep(3)
+        
+        heard = input("\nDid you hear THREE BEEPS from the robot? (y/n): ").strip().lower()
+        results["write_based"]["user_heard_audio"] = (heard == 'y')
+        
+        # Test 2: Callback-based with queue (proposed fix)
+        print("\n\n[TEST 2 of 3] CALLBACK-BASED WITH QUEUE (proposed fix)")
+        input("Press ENTER to play test audio...")
+        results["callback_queue"] = self.test_callback_based()
+        time.sleep(3)
+        
+        heard = input("\nDid you hear THREE BEEPS from the robot? (y/n): ").strip().lower()
+        results["callback_queue"]["user_heard_audio"] = (heard == 'y')
+        
+        # Test 3: Callback-based file-style (known working)
+        print("\n\n[TEST 3 of 3] CALLBACK FILE-STYLE (like play_sound)")
+        input("Press ENTER to play test audio...")
+        results["callback_file"] = self.test_callback_file_style()
+        time.sleep(3)
+        
+        heard = input("\nDid you hear THREE BEEPS from the robot? (y/n): ").strip().lower()
+        results["callback_file"]["user_heard_audio"] = (heard == 'y')
+        
+        # Print summary
+        self._print_summary(results)
+        
+        return results
+    
+    def _print_summary(self, results: Dict[str, Any]):
+        """Print test summary."""
+        print("\n" + "="*70)
+        print("TEST RESULTS SUMMARY")
+        print("="*70)
+        
+        for test_name, result in results.items():
+            print(f"\n{result['method'].upper()}:")
+            print(f"  User heard audio: {'✓ YES' if result.get('user_heard_audio', False) else '✗ NO'}")
+            print(f"  Technical success: {'✓ YES' if result['works'] else '✗ NO'}")
+            if result['latency']:
+                print(f"  Latency: {result['latency']:.3f}s")
+            if result['errors']:
+                print(f"  Errors: {', '.join(result['errors'])}")
+        
+        print("\n" + "="*70)
+        print("ANALYSIS:")
+        print("="*70)
+        
+        # Count which methods worked
+        heard_write = results["write_based"].get("user_heard_audio", False)
+        heard_queue = results["callback_queue"].get("user_heard_audio", False)
+        heard_file = results["callback_file"].get("user_heard_audio", False)
+        
+        if not heard_write and not heard_queue and not heard_file:
+            print("⚠ NO AUDIO HEARD on any test!")
+            print("  Possible issues:")
+            print("  - Robot speaker not working")
+            print("  - Wrong audio device selected")
+            print("  - Volume too low")
+        elif heard_write:
+            print("✓ Write-based method WORKS on your system")
+            print("  This is unexpected for Windows + USB audio.")
+            print("  The fix may not be necessary, but callback-based is still")
+            print("  more reliable and recommended for cross-platform compatibility.")
+        else:
+            print("✗ Write-based method DOES NOT WORK (expected on Windows)")
+            
+        if heard_queue:
+            print("✓ Queue-based callback method WORKS")
+            print("  ✅ RECOMMENDATION: Apply this fix to audio_sounddevice.py")
+        else:
+            print("✗ Queue-based callback method did not work")
+            print("  This needs investigation before applying fix.")
+            
+        if heard_file:
+            print("✓ File-style callback method WORKS")
+            print("  This confirms the callback approach is viable.")
+        
+        print("\n" + "="*70)
+        print("FINAL RECOMMENDATION:")
+        print("="*70)
+        
+        if heard_queue and not heard_write:
+            print("✅ APPLY THE FIX")
+            print("   The queue-based callback method works while write-based doesn't.")
+            print("   This will enable the conversation app audio on Windows.")
+        elif heard_queue and heard_write:
+            print("⚠ OPTIONAL: Apply fix for better compatibility")
+            print("   Both methods work, but callback-based is more reliable.")
+        elif not heard_queue and heard_file:
+            print("⚠ INVESTIGATE: Queue-based needs debugging")
+            print("   File-style works but queue-based doesn't.")
+        else:
+            print("⚠ DO NOT APPLY FIX YET")
+            print("   Need to investigate why methods aren't working.")
+        
+        print("="*70)
+
+
+if __name__ == "__main__":
+    tester = AudioTester()
+    results = tester.run_all_tests()
+    
+    print("\n✓ Testing complete! See summary above for recommendations.")

From 721b8012f4bd10f2e6191fa2179c5b1d979e2374 Mon Sep 17 00:00:00 2001
From: Daniel Ritchie <daniel@brainwavecollective.ai>
Date: Mon, 3 Nov 2025 19:26:39 -0700
Subject: [PATCH 2/5] Align audio_sounddevice.py with main (drop AudioBackend usage)

---
 src/reachy_mini/media/audio_sounddevice.py | 138 ++++++++-------------
 1 file changed, 55 insertions(+), 83 deletions(-)

diff --git a/src/reachy_mini/media/audio_sounddevice.py b/src/reachy_mini/media/audio_sounddevice.py
index 81593621..4b354e49 100644
--- a/src/reachy_mini/media/audio_sounddevice.py
+++ b/src/reachy_mini/media/audio_sounddevice.py
@@ -4,7 +4,6 @@
 import threading
 import time
 from collections import deque
-from queue import Queue, Empty
 from typing import Any, List, Optional
 
 import numpy as np
@@ -15,18 +14,20 @@
 
 from reachy_mini.utils.constants import ASSETS_ROOT_PATH
 
-from .audio_base import AudioBackend, AudioBase
+from .audio_base import AudioBase  # NOTE: AudioBackend is NOT present in head
 
 
 class SoundDeviceAudio(AudioBase):
     """Audio device implementation using sounddevice."""
-    
+
     def __init__(
         self,
         frames_per_buffer: int = 1024,
         log_level: str = "INFO",
     ) -> None:
-        super().__init__(backend=AudioBackend.SOUNDDEVICE, log_level=log_level)
+        # audio_base.AudioBase in head takes only log_level
+        super().__init__(log_level=log_level)
+
         self.frames_per_buffer = frames_per_buffer
         self.stream = None
         self._output_stream = None
@@ -36,6 +37,7 @@ def __init__(
         self._output_device_id = self.get_output_device_id("respeaker")
         self._input_device_id = self.get_input_device_id("respeaker")
 
+        # Streaming state (replaces queue/accumulation approach)
         self._streaming_active = False
         self._chunk_fifo: deque[npt.NDArray[np.float32]] = deque()
         self._queued_samples: int = 0
@@ -44,8 +46,11 @@ def __init__(
         self._underflows: int = 0
         self._overflows: int = 0
 
+    # ---------- Input (recording) ----------
+
     def start_recording(self) -> None:
         """Open the audio input stream, using ReSpeaker card if available."""
+        # Make channel/dtype explicit to avoid hidden conversions
         self.stream = sd.InputStream(
             blocksize=self.frames_per_buffer,
             device=self._input_device_id,
@@ -69,7 +74,6 @@ def _callback(
     ) -> None:
         if status:
             self.logger.warning(f"SoundDevice status: {status}")
-
         self._buffer.append(indata.copy())
 
     def get_audio_sample(self) -> Optional[npt.NDArray[np.float32]]:
@@ -89,6 +93,8 @@ def stop_recording(self) -> None:
             self.stream = None
             self.logger.info("SoundDevice audio stream closed.")
 
+    # ---------- Output (streaming TTS/audio) ----------
+
     def push_audio_sample(self, data: npt.NDArray[np.float32]) -> None:
         """Push PCM mono float32 audio into the output FIFO."""
         if not self._streaming_active or self._output_stream is None:
@@ -107,17 +113,10 @@ def push_audio_sample(self, data: npt.NDArray[np.float32]) -> None:
         self._chunk_fifo.append(data)
         self._queued_samples += data.shape[0]
 
-    # --- helper: compute target watermark in samples ---
     def _target_buffer_samples(self) -> int:
+        """Watermark in samples for small prebuffer to smooth bursty input."""
         return int(self.SAMPLE_RATE * (self._target_buffer_ms / 1000.0))
-        
-        # A modest prebuffer (start at 120 ms) smooths bursty input. Tune 80–200 ms depending on your device/OS. Lower = snappier, higher = safer.
-       
-        # Try self._target_buffer_ms = 150 and/or frames_per_buffer = 512 (lower latency) or 2048 (more stable).
-        
-        # FYI: On Windows with ReSpeaker, WASAPI shared mode can be finicky. Try larger frames_per_buffer (e.g., 2048) first; if still choppy, bump _target_buffer_ms to ~200 ms.
-        
-    # --- improved callback: drain multiple chunks + carry-over ---
+
     def _streaming_callback(
         self,
         outdata: npt.NDArray[np.float32],
@@ -125,9 +124,8 @@ def _streaming_callback(
         time: Any,
         status: sd.CallbackFlags,
     ) -> None:
-        # Handle PortAudio status flags
+        # Track under/overflow for diagnostics
         if status:
-            # Track under/over flows for debugging
             if status.input_overflow or status.output_underflow:
                 if status.output_underflow:
                     self._underflows += 1
@@ -135,15 +133,12 @@ def _streaming_callback(
                     self._overflows += 1
                 self.logger.debug(f"Audio status: {status} (uf={self._underflows}, of={self._overflows})")
 
-        # Ensure we don't leave stale data in other channels
-        # outdata shape: (frames, channels)
         out = outdata[:, 0]
-        out[:] = 0.0  # default to silence; we’ll fill what we can
+        out[:] = 0.0  # default to silence
 
-        # Keep a small prebuffer to avoid stutter right after starts or small gaps
         target = self._target_buffer_samples()
 
-        # ---- read from tail first
+        # 1) Drain carry-over tail first
         written = 0
         if self._tail is not None and self._tail.size:
             take = min(frames, self._tail.size)
@@ -154,14 +149,13 @@ def _streaming_callback(
             else:
                 self._tail = None
 
-        # ---- then drain FIFO until we fill this block
+        # 2) Drain FIFO; allow multiple chunks to fill the block
         while written < frames:
             try:
                 if self._queued_samples < target:
-                    # Not enough buffered audio—leave remaining zeros (silence)
+                    # Not enough buffered audio—keep remaining zeros
                     break
 
-                # Pull next chunk from local FIFO (not the Queue; producer pushes here)
                 chunk = self._chunk_fifo.popleft()
                 self._queued_samples -= chunk.shape[0]
 
@@ -170,17 +164,14 @@ def _streaming_callback(
                     out[written:written + chunk.shape[0]] = chunk
                     written += chunk.shape[0]
                 else:
-                    # Partially consume; keep remainder as tail
                     out[written:frames] = chunk[:need]
-                    self._tail = chunk[need:]  # carry over to next callback
+                    self._tail = chunk[need:]  # carry remainder to next callback
                     written = frames
-
             except IndexError:
-                # FIFO empty
-                break
+                break  # FIFO empty
 
-    # --- start_playing: reset tracking, explicit dtype ---
     def start_playing(self) -> None:
+        """Open the audio output stream (callback mode with smoothing)."""
         self._streaming_active = True
         self._chunk_fifo.clear()
         self._queued_samples = 0
@@ -202,6 +193,7 @@ def start_playing(self) -> None:
         self.logger.info("SoundDevice audio output stream opened (callback mode w/ smoothing).")
 
     def stop_playing(self) -> None:
+        """Close the audio output stream."""
         self._streaming_active = False
         if self._output_stream is not None:
             try:
@@ -214,6 +206,8 @@ def stop_playing(self) -> None:
         self._tail = None
         self.logger.info("SoundDevice audio output stream closed.")
 
+    # ---------- One-shot file playback (mono output) ----------
+
     def play_sound(self, sound_file: str, autoclean: bool = False) -> None:
         """Play a sound file from the assets directory or a given path using sounddevice and soundfile."""
         file_path = f"{ASSETS_ROOT_PATH}/{sound_file}"
@@ -232,22 +226,20 @@ def play_sound(self, sound_file: str, autoclean: bool = False) -> None:
         self.logger.debug(f"Playing sound '{file_path}' at {samplerate_in} Hz")
 
         self.stop_playing()
-        start = [0]  # using list to modify in callback
+        start = [0]
         length = len(data)
 
         def callback(
             outdata: npt.NDArray[np.float32],
             frames: int,
-            time: Any,  # cdata 'struct PaStreamCallbackTimeInfo *
+            time: Any,
             status: sd.CallbackFlags,
         ) -> None:
-            """Actual playback."""
             if status:
                 self.logger.warning(f"SoundDevice output status: {status}")
 
             end = start[0] + frames
             if end > length:
-                # Fill the output buffer with the audio data, or zeros if finished
                 outdata[: length - start[0], 0] = data[start[0] :]
                 outdata[length - start[0] :, 0] = 0
                 raise sd.CallbackStop()
@@ -262,19 +254,15 @@ def callback(
             device=self._output_device_id,
             channels=1,
             callback=callback,
-            finished_callback=event.set,  # release the device when done
+            finished_callback=event.set,
         )
         if self._output_stream is None:
             raise RuntimeError("Failed to open SoundDevice audio output stream.")
         self._output_stream.start()
 
         def _clean_up_thread() -> None:
-            """Thread to clean up the output stream after playback.
-
-            The daemon may play sound but should release the audio device.
-            """
             event.wait()
-            timeout = 5  # seconds
+            timeout = 5
             waited = 0
             while (
                 self._output_stream is not None
@@ -286,61 +274,45 @@ def _clean_up_thread() -> None:
             self.stop_playing()
 
         if autoclean:
-            threading.Thread(
-                target=_clean_up_thread,
-                daemon=True,
-            ).start()
+            threading.Thread(target=_clean_up_thread, daemon=True).start()
 
-    def _find_device_id(
-        self, name_contains: str, device_type: str
-    ) -> int:
-        """Find device ID by name and type with fallback logic.
+    # ---------- Device selection (name match, falling back to the default device) ----------
 
-        Args:
-            name_contains: Substring to search for in device name (case-insensitive)
-            device_type: Either "input" or "output"
-
-        Returns:
-            Device index
-
-        Raises:
-            RuntimeError: If no device with appropriate channels found
+    def get_output_device_id(self, name_contains: str) -> int:
+        """Return the output device id whose name contains the given string (case-insensitive).
+        If not found, return the default output device id.
         """
         devices = sd.query_devices()
-        channel_key = f"max_{device_type}_channels"
-
-        # First try: Search for device by specific name (e.g., "respeaker")
         for idx, dev in enumerate(devices):
             if (
                 name_contains.lower() in dev["name"].lower()
-                and dev.get(channel_key, 0) > 0
+                and dev["max_output_channels"] > 0
             ):
                 return idx
-
-        # Log warning if device with specific name not found
         self.logger.warning(
-            f"No {device_type} device containing '{name_contains}' found. Using first available {device_type} device."
-        )
-
-        # Fallback: Return first device with appropriate channels
-        for idx, dev in enumerate(devices):
-            if dev.get(channel_key, 0) > 0:
-                return idx
-
-        raise RuntimeError(
-            f"No {device_type} audio device with {device_type} channels found."
+            f"No output device found containing '{name_contains}', using default."
         )
-
-    def get_output_device_id(self, name_contains: str) -> int:
-        """Return the output device id whose name contains the given string (case-insensitive).
-
-        If not found, return the first available output device.
-        """
-        return self._find_device_id(name_contains, "output")
+        return self._safe_query_device("output")
 
     def get_input_device_id(self, name_contains: str) -> int:
         """Return the input device id whose name contains the given string (case-insensitive).
-
-        If not found, return the first available input device.
+        If not found, return the default input device id.
         """
-        return self._find_device_id(name_contains, "input")
+        devices = sd.query_devices()
+        for idx, dev in enumerate(devices):
+            if (
+                name_contains.lower() in dev["name"].lower()
+                and dev["max_input_channels"] > 0
+            ):
+                return idx
+        self.logger.warning(
+            f"No input device found containing '{name_contains}', using default."
+        )
+        return self._safe_query_device("input")
+
+    def _safe_query_device(self, kind: str) -> int:
+        try:
+            return int(sd.query_devices(None, kind)["index"])
+        except sd.PortAudioError:
+            # Fallback: sd.default.device is an (input, output) pair
+            return int(sd.default.device[1 if kind == "output" else 0])

From 7a696ff00332fbdfc2508e2f9443f0fed77b390f Mon Sep 17 00:00:00 2001
From: Daniel Ritchie <daniel@brainwavecollective.ai>
Date: Mon, 3 Nov 2025 20:00:21 -0700
Subject: [PATCH 3/5] removed unrelated file

---
 my_local_files.txt | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 my_local_files.txt

diff --git a/my_local_files.txt b/my_local_files.txt
deleted file mode 100644
index 2e9418aa..00000000
--- a/my_local_files.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-
-src/reachy_mini/media/audio_sounddevice.py
-tests/test_audio_methods.py

From dff9fb4bebeb83cdcfebc711fca8887907412947 Mon Sep 17 00:00:00 2001
From: Daniel Ritchie <daniel@brainwavecollective.ai>
Date: Tue, 4 Nov 2025 23:21:15 -0700
Subject: [PATCH 4/5] push_audio_sample: always downmix to mono

---
 src/reachy_mini/media/audio_sounddevice.py | 23 ++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/reachy_mini/media/audio_sounddevice.py b/src/reachy_mini/media/audio_sounddevice.py
index 4b354e49..a6717b77 100644
--- a/src/reachy_mini/media/audio_sounddevice.py
+++ b/src/reachy_mini/media/audio_sounddevice.py
@@ -96,22 +96,25 @@ def stop_recording(self) -> None:
     # ---------- Output (streaming TTS/audio) ----------
 
     def push_audio_sample(self, data: npt.NDArray[np.float32]) -> None:
-        """Push PCM mono float32 audio into the output FIFO."""
         if not self._streaming_active or self._output_stream is None:
             self.logger.warning("Output stream is not active. Call start_playing() first.")
             return
 
-        # Ensure shape (n,) float32 mono
-        if data.ndim == 2:
-            if data.shape[1] > 1:
-                data = np.mean(data, axis=1)
+        a = np.asarray(data, dtype=np.float32, order="C")
+
+        # Accept (N,), (1,N), (N,1), (C,N), (N,C)
+        if a.ndim == 2:
+            if 1 in a.shape:
+                a = a.reshape(-1)                        # (1,N) or (N,1) -> (N,)
             else:
-                data = data[:, 0]
-        data = np.asarray(data, dtype=np.float32, order="C")
+                chan_axis = 0 if a.shape[0] <= a.shape[1] else 1  # smaller dim = channels
+                a = a.mean(axis=chan_axis)              # (C,N) or (N,C) -> (N,)
+        elif a.ndim > 2:
+            a = a.reshape(-1)
+
+        self._chunk_fifo.append(a)
+        self._queued_samples += int(a.shape[0])
 
-        # Push into local FIFO; keep sample count
-        self._chunk_fifo.append(data)
-        self._queued_samples += data.shape[0]
 
     def _target_buffer_samples(self) -> int:
         """Watermark in samples for small prebuffer to smooth bursty input."""

From fcba73c8dedeccd6f4615903dcf5dfc081cbac89 Mon Sep 17 00:00:00 2001
From: Daniel Ritchie <daniel@brainwavecollective.ai>
Date: Thu, 27 Nov 2025 00:12:36 -0700
Subject: [PATCH 5/5] Move test_audio_methods.py to examples/debug per Fabien

---
 {tests => examples/debug}/test_audio_methods.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {tests => examples/debug}/test_audio_methods.py (100%)

diff --git a/tests/test_audio_methods.py b/examples/debug/test_audio_methods.py
similarity index 100%
rename from tests/test_audio_methods.py
rename to examples/debug/test_audio_methods.py