Clean up voice session handling

zehnm · zehnm · commit 833a3806407d · 2025-12-15T17:09:36.000+01:00
Verify that the AudioConfiguration received in VoiceBegin matches the session's configuration from the voice_start command.
This requires firmware 2.8.2 or newer.
diff --git a/tests/test_voice_assistant.py b/tests/test_voice_assistant.py
@@ -0,0 +1,71 @@
+import unittest
+
+from ucapi.voice_assistant import (
+    DEFAULT_AUDIO_CHANNELS,
+    DEFAULT_SAMPLE_FORMAT,
+    DEFAULT_SAMPLE_RATE,
+    AudioConfiguration,
+    SampleFormat,
+)
+from ucapi.proto import ucr_integration_voice_pb2 as pb2
+
+
+class TestVoiceAssistantConversions(unittest.TestCase):
+    def test_sample_format_from_proto_supported(self):
+        self.assertEqual(SampleFormat.from_proto(pb2.I16), SampleFormat.I16)
+        self.assertEqual(SampleFormat.from_proto(int(pb2.U32)), SampleFormat.U32)
+        self.assertEqual(SampleFormat.from_proto("f32"), SampleFormat.F32)
+
+    def test_sample_format_from_proto_unsupported_to_none(self):
+        # I8/U8 and unknown names have no member in the local enum -> None
+        self.assertIsNone(SampleFormat.from_proto(pb2.I8))
+        self.assertIsNone(SampleFormat.from_proto(pb2.U8))
+        self.assertIsNone(SampleFormat.from_proto("i8"))
+        self.assertIsNone(SampleFormat.from_proto("unknown"))
+
+    def test_audio_cfg_from_proto_message(self):
+        msg = pb2.AudioConfiguration(
+            channels=2,
+            sample_rate=22050,
+            sample_format=pb2.I32,
+            format=pb2.PCM,
+        )
+        cfg = AudioConfiguration.from_proto(msg)
+
+        self.assertIsInstance(cfg, AudioConfiguration)
+        self.assertEqual(cfg.channels, 2)
+        self.assertEqual(cfg.sample_rate, 22050)
+        self.assertEqual(cfg.sample_format, SampleFormat.I32)
+
+    def test_audio_cfg_from_dict_and_string_coercions(self):
+        cfg = AudioConfiguration.from_proto(
+            {
+                "channels": "2",
+                "sample_rate": "16000",
+                "sample_format": "u16",
+            }
+        )
+        self.assertEqual(cfg.channels, 2)
+        self.assertEqual(cfg.sample_rate, 16000)
+        self.assertEqual(cfg.sample_format, SampleFormat.U16)
+
+    def test_audio_cfg_defaults_and_unsupported_sample_format(self):
+        # Non-numeric / empty strings fall back to the default channels and rate
+        cfg = AudioConfiguration.from_proto(
+            {
+                "channels": "x",
+                "sample_rate": "",
+                "sample_format": pb2.U8,  # unsupported in local enum
+            }
+        )
+        self.assertEqual(cfg.channels, DEFAULT_AUDIO_CHANNELS)
+        self.assertEqual(cfg.sample_rate, DEFAULT_SAMPLE_RATE)
+        # Unsupported sample_format falls back to default
+        self.assertEqual(cfg.sample_format, DEFAULT_SAMPLE_FORMAT)
+
+    def test_audio_cfg_from_proto_none(self):
+        self.assertIsNone(AudioConfiguration.from_proto(None))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/ucapi/api.py b/ucapi/api.py
@@ -449,20 +449,13 @@ async def _on_remote_voice_begin(self, websocket, msg: RemoteVoiceBegin) -> None
         """
         if self._voice_handler is None:
             # Log once per stream and ignore further binary messages.
-            cfg = getattr(msg, "configuration", None)
             _LOG.warning(
-                "[%s] proto VoiceBegin: session_id=%s cfg(ch=%s sr=%s fmt=%s af=%s) (no voice handler)",
+                "[%s] proto VoiceBegin: no voice handler registered! Ignoring voice stream",
                 websocket.remote_address,
-                getattr(msg, "session_id", None),
-                getattr(cfg, "channels", None) if cfg else None,
-                getattr(cfg, "sample_rate", None) if cfg else None,
-                getattr(cfg, "sample_format", None) if cfg else None,
-                getattr(cfg, "format", None) if cfg else None,
             )
             return
 
         session_id = int(getattr(msg, "session_id", 0) or 0)
-        session_id = 0  # FIXME(voice) until core is fixed
         session = self._voice_sessions.get(session_id)
         if not session:
             _LOG.error(
@@ -472,13 +465,15 @@ async def _on_remote_voice_begin(self, websocket, msg: RemoteVoiceBegin) -> None
             )
             return
 
-        # TODO(voice) verify AudioConfiguration in session from voice_start command?
-        # cfg = getattr(msg, "configuration", None)
-        # audio_cfg = AudioConfiguration(
-        #     channels=int(getattr(cfg, "channels", 1) or 1),
-        #     sample_rate=int(getattr(cfg, "sample_rate", 0) or 0),
-        #     sample_format=int(getattr(cfg, "sample_format", 0) or 0), # FIXME convert
-        # )
+        # verify AudioConfiguration in session from voice_start command
+        cfg = getattr(msg, "configuration", None)
+        audio_cfg = AudioConfiguration.from_proto(cfg) or AudioConfiguration()
+        if audio_cfg != session.config:
+            _LOG.error(
+                "[%s] proto VoiceBegin: audio cfg does not match voice_start",
+                websocket.remote_address,
+            )
+            return
 
         # Track ownership for cleanup on disconnect
         owners = self._voice_ws_sessions.setdefault(websocket, set())
@@ -510,7 +505,6 @@ async def _on_remote_voice_data(self, websocket, msg: RemoteVoiceData) -> None:
             return
 
         session_id = int(getattr(msg, "session_id", 0) or 0)
-        session_id = 0  # FIXME(voice) until core is fixed
         session = self._voice_sessions.get(session_id)
         if not session:
             _LOG.error(
@@ -540,7 +534,6 @@ async def _on_remote_voice_end(self, _websocket, msg: RemoteVoiceEnd) -> None:
         if self._voice_handler is None:
             return
         session_id = int(getattr(msg, "session_id", 0) or 0)
-        session_id = 0  # FIXME(voice) until core is fixed
         await self._cleanup_voice_session(session_id)
 
     async def _cleanup_voice_session(
@@ -861,7 +854,6 @@ async def _entity_command(
         ):
             params = msg_data["params"]
             session_id = params.get("session_id")
-            session_id = 0  # FIXME(voice) until core is fixed
             cfg = params.get("audio_cfg")
             audio_cfg = (
                 AudioConfiguration(
diff --git a/ucapi/voice_assistant.py b/ucapi/voice_assistant.py
@@ -16,9 +16,18 @@
 from enum import Enum
 from typing import Any, Optional
 
+# The pb2 enum constants further below are imported by name to avoid pylint no-member
 from ucapi.api_definitions import CommandHandler
 from ucapi.entity import Entity, EntityTypes
 
+from ucapi.proto.ucr_integration_voice_pb2 import (  # pylint: disable=no-name-in-module # isort:skip # noqa
+    F32 as PB_F32,
+    I16 as PB_I16,
+    I32 as PB_I32,
+    U16 as PB_U16,
+    U32 as PB_U32,
+)
+
 DEFAULT_AUDIO_CHANNELS = 1
 DEFAULT_SAMPLE_RATE = 16000
 
@@ -97,6 +106,48 @@ class SampleFormat(str, Enum):
     F32 = "F32"
     """Float 32 bit."""
 
+    @classmethod
+    def from_proto(cls, value: Any) -> Optional["SampleFormat"]:
+        """Convert protobuf enum ``SampleFormat`` to Python enum.
+
+        Returns ``None`` when the value is unknown or not available in this
+        Python enum (e.g., ``SAMPLE_FORMAT_UNKNOWN``, ``I8``, ``U8``).
+
+        Accepts the following inputs:
+        - Protobuf enum value (``pb2.SampleFormat``)
+        - Integer value of the protobuf enum
+        - Member name as string, case/whitespace-insensitive (e.g., "I16", "u32")
+        - ``None``
+        """
+        if value is None:
+            return None
+
+        # Map protobuf values (or their ints) to our Python enum
+        mapping: dict[int, SampleFormat] = {
+            int(PB_I16): cls.I16,
+            int(PB_I32): cls.I32,
+            int(PB_U16): cls.U16,
+            int(PB_U32): cls.U32,
+            int(PB_F32): cls.F32,
+        }
+
+        if isinstance(value, int):
+            return mapping.get(int(value))
+
+        if isinstance(value, str):
+            key = value.strip().upper()
+            # Name lookup: only members of this Python enum resolve, others -> None
+            try:
+                return cls[key]
+            except KeyError:
+                return None
+
+        # Fallback for enum-like types (protobuf enum wrappers behave like ints)
+        try:
+            return mapping.get(int(value))
+        except (TypeError, ValueError):
+            return None
+
 
 DEFAULT_SAMPLE_FORMAT = SampleFormat.I16
 
@@ -121,6 +172,69 @@ class AudioConfiguration:
     sample_format: SampleFormat = DEFAULT_SAMPLE_FORMAT
     """Audio sample format."""
 
+    @staticmethod
+    def _to_int(value: Any, default: int) -> int:
+        """Convert ``value`` to ``int``; zero/invalid input yields ``default``.
+
+        ``None``, ``bool``, blank or non-numeric strings, unsupported types and
+        any value equal to zero (``0`` as well as ``"0"``) return ``default``.
+        """
+        if value is None:
+            return default
+        try:
+            if isinstance(value, bool):  # bool is an int subclass: never a count
+                return default
+            if isinstance(value, int):
+                return int(value) or default
+            if isinstance(value, str):
+                # blank and "0" fall back to default, exactly like integer 0
+                return int(value.strip() or 0) or default
+        except (TypeError, ValueError):
+            return default
+        return default
+
+    @classmethod
+    def from_proto(cls, value: Any) -> Optional["AudioConfiguration"]:
+        """Convert protobuf ``AudioConfiguration`` (or ``dict``) to Python model.
+
+        - ``None`` or an unsupported type returns ``None``
+        - Object whose class is named ``AudioConfiguration``: reads attributes
+        - ``dict``: accepts keys ``channels``, ``sample_rate``,
+          ``sample_format`` (strings/ints; invalid values use the defaults)
+
+        The protobuf field ``format`` (``AudioFormat``) is currently ignored in
+        the Python model.
+        """
+        if value is None:
+            return None
+
+        # Extract raw field values from either a proto message or a dict-like
+        if (
+            hasattr(value, "__class__")
+            and value.__class__.__name__ == "AudioConfiguration"
+        ):
+            # Matched by class name only: a protobuf message or this class itself
+            ch = getattr(value, "channels", DEFAULT_AUDIO_CHANNELS)
+            sr = getattr(value, "sample_rate", DEFAULT_SAMPLE_RATE)
+            sf = getattr(value, "sample_format", None)
+        elif isinstance(value, dict):
+            ch = value.get("channels", DEFAULT_AUDIO_CHANNELS)
+            sr = value.get("sample_rate", DEFAULT_SAMPLE_RATE)
+            sf = value.get("sample_format", None)
+        else:
+            # Unsupported type
+            return None
+
+        channels = cls._to_int(ch, DEFAULT_AUDIO_CHANNELS)
+        sample_rate = cls._to_int(sr, DEFAULT_SAMPLE_RATE)
+        sample_format = SampleFormat.from_proto(sf) or DEFAULT_SAMPLE_FORMAT
+
+        return cls(
+            channels=channels,
+            sample_rate=sample_rate,
+            sample_format=sample_format,
+        )
+
 
 @dataclass(slots=True)
 class VoiceAssistantProfile: