Skip to content

Commit 7b3a834

Browse files
authored
Merge pull request #12 from kadirnar/fix-checkpoint-vui
fix: standardize output file naming and fix model loading in VUI inference
2 parents a7e3cd3 + 04a9f6c commit 7b3a834

File tree

5 files changed

+50
-10
lines changed

5 files changed

+50
-10
lines changed

README.md

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,54 @@ uv pip install voicehub
1515

1616
## 📚 Usage
1717

18+
VoiceHub provides a simple, unified interface for working with various Text-to-Speech (TTS) models. Below are examples showing how to use different supported TTS models with the same consistent approach.
19+
20+
### OrpheusTTS Model
21+
1822
```python
1923
from voicehub.automodel import AutoInferenceModel
2024

21-
# Create model using the static from_pretrained method
2225
model = AutoInferenceModel.from_pretrained(
2326
model_type="orpheustts", # or "dia" or "vui"
2427
model_path="canopylabs/orpheus-3b-0.1-ft",
2528
device="cuda",
2629
)
2730

28-
# Generate speech with the model
31+
output = model("Hello, how are you today?", voice="tara", output_file="output.wav")
32+
```
33+
34+
### DiaTTS Model
35+
36+
```python
37+
from voicehub.automodel import AutoInferenceModel
38+
39+
model = AutoInferenceModel.from_pretrained(
40+
model_type="dia",  # or "orpheustts" or "vui"
41+
model_path="dia/dia-100m-base.pt",
42+
device="cuda",
43+
)
44+
2945
output = model(
30-
"Hello, how are you today?", voice="tara", output_file="output"
31-
) # voice param is only for orpheustts
46+
text="Hey, here is some random stuff, the longer the text the less likely the model can cope!",
47+
output_file="output.wav",
48+
)
49+
```
50+
51+
### VuiTTS Model
52+
53+
```python
54+
from voicehub.automodel import AutoInferenceModel
55+
56+
model = AutoInferenceModel.from_pretrained(
57+
model_type="vui",  # or "orpheustts" or "dia"
58+
model_path="vui-100m-base.pt",
59+
device="cuda",
60+
)
61+
62+
output = model(
63+
text="Hey, here is some random stuff, the longer the text the less likely the model can cope!",
64+
output_file="output.wav",
65+
)
3266
```
3367

3468
## 🤗 Contributing
@@ -43,4 +77,4 @@ pre-commit run --all-files
4377

4478
- [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS)
4579
- [Dia](https://github.com/nari-labs/dia)
46-
- [VUI](https://github.com/fluxions-ai/vui)
80+
- [Vui](https://github.com/fluxions-ai/vui)

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,6 @@ torchaudio
55
pydantic
66
descript-audio-codec
77
soundfile
8+
inflect
9+
pandas
10+
pyannote.audio

voicehub/models/orpheustts/inference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def _postprocess_tokens(self, generated_ids: torch.Tensor) -> list:
145145

146146
return adjusted
147147

148-
def __call__(self, prompt: str, voice: str, output_file: str = "sample"):
148+
def __call__(self, prompt: str, voice: str, output_file: str = "output.wav"):
149149
"""
150150
Generate speech from text prompts.
151151
@@ -181,7 +181,7 @@ def __call__(self, prompt: str, voice: str, output_file: str = "sample"):
181181
audio = self._redistribute_codes(codes)
182182
# Save as 24kHz WAV file
183183
sf.write(
184-
f"{output_file}.wav",
184+
f"{output_file}",
185185
audio.detach().squeeze().cpu().numpy(),
186186
24000,
187187
)

voicehub/models/vui/inference.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,22 @@
11
import torchaudio
22

3-
from voicehub.models.vui.inference import render
43
from voicehub.models.vui.model import Vui
4+
from voicehub.models.vui.tts import render
55

66

77
class VuiTTS:
88

99
def __init__(self, model_path: str, device: str = "cuda"):
1010
self.model_path = model_path
11+
self.device = device
1112
self.model = None
1213

1314
def load_model(self):
1415
model = Vui.from_pretrained(checkpoint_path=self.model_path).to(self.device)
1516
self.model = model
1617

1718
def __call__(self, text: str, output_file: str = "output.wav"):
19+
if self.model is None:
20+
self.load_model()
1821
waveform = render(self.model, text)
1922
torchaudio.save(output_file, waveform[0], 22050)

voicehub/models/vui/model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,8 @@ def from_pretrained(
386386
from huggingface_hub import hf_hub_download
387387

388388
checkpoint_path = hf_hub_download(
389-
"fluxions/vui",
390-
checkpoint_path,
389+
repo_id="fluxions/vui",
390+
filename=checkpoint_path,
391391
)
392392
checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
393393

0 commit comments

Comments
 (0)