
Commit cc18caf

Merge remote-tracking branch 'upstream/main'
2 parents 8f19e4b + 6d23ea4 commit cc18caf

7 files changed: +202, -32 lines

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ jobs:
         pytest -v --cov-report=xml --cov=src/sdialog

     - name: Upload coverage reports to Codecov
+      if: matrix.python-version == '3.10'
       uses: codecov/codecov-action@v5
       with:
         fail_ci_if_error: false

docs/examples/index.rst

Lines changed: 53 additions & 0 deletions

@@ -90,6 +90,59 @@ Let's start with something fun and straightforward—creating a simple dialogue

 Individual agents can be served and exposed as an OpenAI-compatible API endpoint with the :meth:`~sdialog.agents.Agent.serve` method (e.g. ``mentor.serve(port=1333)``), see :ref:`here <serving_agents>` for more details.

+.. _ex-agent-tools:
+
+Agent Tools (Function Calling)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can attach plain Python functions as tools. When the backend supports tool/function calling,
+the agent can call them during response generation.
+
+.. code-block:: python
+
+    import sdialog
+    from sdialog.agents import Agent
+
+    sdialog.config.llm("openai:gpt-4.1")
+
+    def get_weather(city: str) -> dict:
+        """Return weather information for a city."""
+        return {"city": city, "temperature_c": 21, "condition": "sunny"}
+
+    assistant = Agent(
+        name="WeatherAssistant",
+        tools=[get_weather],
+        system_prompt="Use tools when needed and answer concisely."
+    )
+
+    print(assistant("What's the weather in Geneva?"))
+
+.. _ex-final-response-tool:
+
+Direct Tool Output with ``@final_response_tool``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a tool returns a pre-formatted result (e.g., a markdown table), you can mark it with
+``@final_response_tool`` so the agent returns the tool output directly as the final response.
+
+This is especially useful when the tool already produces exactly the text you want the user to see.
+Without the decorator, the LLM would typically read the tool output and generate a new answer from it,
+which may add extra wording, reformat the content, or spend unnecessary tokens reproducing a large block
+of structured text. With ``@final_response_tool``, the tool output becomes the final answer directly.
+
+.. code-block:: python
+
+    from sdialog.agents import Agent, final_response_tool
+
+    @final_response_tool
+    def get_report_table(topic: str) -> str:
+        return "| Item | Value |\n|---|---|\n| example | 42 |"
+
+    agent = Agent(tools=[get_report_table])
+
+Notes:
+
+- Non-empty tool output is returned directly as the agent's final answer.
+- Empty tool output falls back to the regular tool flow (the LLM can continue and synthesize a response).

 Few-Shot Learning with Example Dialogs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Now let's explore one of SDialog's most powerful features! We can guide our dialogues by providing examples that show the system what style, structure, or format we want. This technique, called few-shot learning, works by supplying ``example_dialogs`` to generation components. These exemplar dialogs are injected into the system prompt to steer tone, task format, and conversation flow.
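The fallback rule described in the notes above (non-empty decorated-tool output short-circuits the LLM, empty output falls through) can be sketched in plain Python. This is a hypothetical mimic for illustration only: `dispatch` and `fake_llm` are stand-ins and not part of sdialog's API, whose real dispatch logic lives inside `Agent`.

```python
def final_response_tool(fn):
    """Sketch of the decorator: just flags the tool (hypothetical)."""
    fn._final_response = True
    return fn

def dispatch(tool, llm_synthesize, *args):
    """Hypothetical dispatch mimicking the documented fallback rule."""
    output = tool(*args)
    if getattr(tool, "_final_response", False) and output:
        return output                  # non-empty output becomes the final answer verbatim
    return llm_synthesize(output)      # empty output: the LLM continues normally

@final_response_tool
def get_report_table(topic: str) -> str:
    return "| Item | Value |\n|---|---|\n| example | 42 |"

@final_response_tool
def empty_tool(topic: str) -> str:
    return ""

def fake_llm(tool_output):
    return "LLM answer based on: " + repr(tool_output)

print(dispatch(get_report_table, fake_llm, "sales"))  # the table, verbatim
print(dispatch(empty_tool, fake_llm, "sales"))        # falls back to the LLM
```

The point of the decorator, as the docs note, is to avoid the LLM re-reading and paraphrasing a large block of already-final text.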

requirements-audio-test.txt

Lines changed: 0 additions & 1 deletion

@@ -4,6 +4,5 @@ sox
 jams
 pyloudnorm
 pyroomacoustics
-datasets<=3.6.0
 huggingface_hub[cli]
 dscaper>=1.7.7

src/sdialog/audio/dialog.py

Lines changed: 1 addition & 1 deletion

@@ -490,7 +490,7 @@ def persona_to_voice(
     persona_to_voice_desc: Union[str, callable] = None,
     voices: dict[Role, Union[Voice, tuple[str, str]]] = None,
     keep_duplicate: bool = False,
-    tts_engine: BaseTTS | BaseVoiceCloneTTS = None,
+    tts_engine: Union[BaseTTS, BaseVoiceCloneTTS] = None,
     seed: int = None
 ) -> None:
     """

src/sdialog/audio/tts/qwen3/tts.py

Lines changed: 2 additions & 1 deletion

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: MIT
 import torch
 import numpy as np
+from typing import Optional

 from ..base import BaseTTS, BaseVoiceCloneTTS
 from sdialog.audio.normalizers import TextNormalizer, UnicodeToAsciiNormalizer, normalize_text
@@ -176,7 +177,7 @@ def __init__(

     def generate(self,
                  text: str,
-                 speaker_voice: str | object = None,
+                 speaker_voice: Optional[object] = None,
                  tts_pipeline_kwargs: dict = {}) -> tuple[np.ndarray, int]:
         """
         Generates audio from text using voice cloning.

tests/conftest.py

Lines changed: 129 additions & 0 deletions

@@ -0,0 +1,129 @@
+import sys
+import types
+import importlib.machinery
+from types import SimpleNamespace
+
+import numpy as np
+
+
+def _install_qwen_tts_stub() -> None:
+    try:
+        __import__("qwen_tts")
+        return
+    except ImportError:
+        pass
+
+    class _FakeQwen3TTSModel:
+        def __init__(self, *args, **kwargs):
+            self.args = args
+            self.kwargs = kwargs
+
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            return cls(*args, **kwargs)
+
+        def generate_custom_voice(self, text, speaker=None, **kwargs):
+            return [np.zeros(24_000, dtype=np.float32)], 24_000
+
+        def generate_voice_clone(self, text, **kwargs):
+            return [np.zeros(24_000, dtype=np.float32)], 24_000
+
+        def generate_voice_design(self, text, language=None, instruct=None, **kwargs):
+            return [np.zeros(24_000, dtype=np.float32)], 24_000
+
+        def create_voice_clone_prompt(self, ref_audio=None, ref_text=None, **kwargs):
+            return {
+                "ref_audio": ref_audio,
+                "ref_text": ref_text,
+            }
+
+    qwen_tts_module = types.ModuleType("qwen_tts")
+    qwen_tts_module.Qwen3TTSModel = _FakeQwen3TTSModel
+    qwen_tts_module.__spec__ = importlib.machinery.ModuleSpec("qwen_tts", loader=None)
+    sys.modules["qwen_tts"] = qwen_tts_module
+
+
+_install_qwen_tts_stub()
+
+
+def _install_torchcodec_stub() -> None:
+    try:
+        __import__("torchcodec")
+        return
+    except ImportError:
+        pass
+
+    class _FakeTensor:
+        def __init__(self, array):
+            self._array = array
+
+        def cpu(self):
+            return self
+
+        def numpy(self):
+            return self._array
+
+    class _FakeAudioSamples:
+        def __init__(self, data=None, sample_rate: int = 16_000):
+            _arr = np.zeros((1, sample_rate), dtype=np.float32) if data is None else data
+            self.data = _FakeTensor(_arr)
+            self.sample_rate = sample_rate
+
+    class _FakeAudioDecoder:
+        def __init__(self, source=None, *args, **kwargs):
+            self.source = source
+            self.args = args
+            self.kwargs = kwargs
+            _path = None
+            if isinstance(source, dict):
+                _path = source.get("path")
+            else:
+                _path = getattr(source, "path", None)
+
+            self.metadata = SimpleNamespace(
+                sample_rate=16_000,
+                path=_path,
+            )
+
+        def __getitem__(self, key: str):
+            if key == "path":
+                return self.metadata.path
+            if key == "sampling_rate":
+                return self.metadata.sample_rate
+            if key == "array":
+                y = self.get_all_samples().data.cpu().numpy()
+                return np.mean(y, axis=tuple(range(y.ndim - 1))) if y.ndim > 1 else y
+            raise KeyError(key)
+
+        def get_all_samples(self):
+            return _FakeAudioSamples()
+
+        def get_samples_played_in_range(self, *_args, **_kwargs):
+            return SimpleNamespace(sample_rate=self.metadata.sample_rate)
+
+    torchcodec_module = types.ModuleType("torchcodec")
+    decoders_module = types.ModuleType("torchcodec.decoders")
+    decoders_module.AudioDecoder = _FakeAudioDecoder
+    torchcodec_module.decoders = decoders_module
+    torchcodec_module.__spec__ = importlib.machinery.ModuleSpec("torchcodec", loader=None)
+    decoders_module.__spec__ = importlib.machinery.ModuleSpec("torchcodec.decoders", loader=None)
+
+    sys.modules["torchcodec"] = torchcodec_module
+    sys.modules["torchcodec.decoders"] = decoders_module
+
+    # transformers.audio_utils (and others) call importlib.metadata.version("torchcodec")
+    # at module level. That call bypasses sys.modules and reads on-disk dist-info, so
+    # it raises PackageNotFoundError even though our stub is in sys.modules, causing a
+    # cascade failure across ALL tests. Patch it to return a harmless version string.
+    import importlib.metadata as _imeta
+    _real_version = _imeta.version
+
+    def _patched_version(name: str) -> str:
+        if name == "torchcodec":
+            return "0.0.0"
+        return _real_version(name)
+
+    _imeta.version = _patched_version
+
+
+_install_torchcodec_stub()
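The core pattern in the new conftest.py, registering a fake module in `sys.modules` before anything tries to import the real one, reduces to a small helper. The `install_stub` function and `fake_engine` module below are hypothetical, written only to isolate the technique:

```python
import sys
import types
import importlib.machinery

def install_stub(name, **attrs):
    """Create a fake module and register it so `import name` succeeds."""
    module = types.ModuleType(name)
    for key, value in attrs.items():
        setattr(module, key, value)
    # Attaching a ModuleSpec (as the conftest does) keeps importlib-aware
    # callers such as importlib.util.find_spec happy with the stub.
    module.__spec__ = importlib.machinery.ModuleSpec(name, loader=None)
    sys.modules[name] = module
    return module

install_stub("fake_engine", answer=lambda: 42)

import fake_engine           # resolved from sys.modules; no package on disk needed
print(fake_engine.answer())  # 42
```

As the conftest comment warns, this only fools the import system: code that checks installed distributions via `importlib.metadata.version` reads on-disk dist-info and will still fail, hence the additional monkey-patch for `torchcodec`.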

tests/test_audio.py

Lines changed: 16 additions & 29 deletions

@@ -5,35 +5,22 @@
 import numpy as np
 import pandas as pd

-# Try to import audio dependencies
-try:
-    import soundfile as sf
-
-    from sdialog.audio.turn import AudioTurn
-    from sdialog.audio.room_generator import BasicRoomGenerator
-    from sdialog.audio.utils import Role, Furniture, SpeakerSide
-    from sdialog.audio.room import Position3D, Dimensions3D, DirectivityType, Room
-    from sdialog.audio.voice_database import Voice, is_a_audio_file
-    from sdialog.audio.voice_database import BaseVoiceDatabase, LocalVoiceDatabase, VoiceDatabase
-    from sdialog.audio.tts import BaseTTS
-    from sdialog.audio.jsalt import MedicalRoomGenerator, RoomRole
-    from sdialog.audio.acoustics_simulator import AcousticsSimulator, AudioSource
-    from sdialog.audio.dialog import AudioDialog
-    from sdialog.audio.pipeline import AudioPipeline, to_audio
-    from sdialog.audio.dscaper_utils import send_utterances_to_dscaper, generate_dscaper_timeline
-    from sdialog.audio.impulse_response_database import LocalImpulseResponseDatabase, RecordingDevice
-    from sdialog.audio.processing import AudioProcessor
-except ImportError:
-    print("\n" + "=" * 80)
-    print("Audio dependencies are not installed. All audio tests will be skipped.")
-    print("=" * 80 + "\n")
-
-    # Skip the entire module - pytest will not collect any tests from this file
-    pytest.skip(
-        "Audio dependencies not installed. If you are working with audio, install them with: "
-        "pip install sdialog[audio]",
-        allow_module_level=True
-    )
+import soundfile as sf
+
+from sdialog.audio.turn import AudioTurn
+from sdialog.audio.room_generator import BasicRoomGenerator
+from sdialog.audio.utils import Role, Furniture, SpeakerSide
+from sdialog.audio.room import Position3D, Dimensions3D, DirectivityType, Room
+from sdialog.audio.voice_database import Voice, is_a_audio_file
+from sdialog.audio.voice_database import BaseVoiceDatabase, LocalVoiceDatabase, VoiceDatabase
+from sdialog.audio.tts import BaseTTS
+from sdialog.audio.jsalt import MedicalRoomGenerator, RoomRole
+from sdialog.audio.acoustics_simulator import AcousticsSimulator, AudioSource
+from sdialog.audio.dialog import AudioDialog
+from sdialog.audio.pipeline import AudioPipeline, to_audio
+from sdialog.audio.dscaper_utils import send_utterances_to_dscaper, generate_dscaper_timeline
+from sdialog.audio.impulse_response_database import LocalImpulseResponseDatabase, RecordingDevice
+from sdialog.audio.processing import AudioProcessor

 from sdialog import Turn, Dialog
 from unittest.mock import MagicMock, patch
