Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/text2speech/Whisper_TTS/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Converting text input to speech given the input language
# Whisper TTS model

This project uses WhisperSpeech to convert text to speech, with an option for voice cloning.
reference - https://github.com/collabora/WhisperSpeech

## Setup and Installation

1. Create and activate a virtual environment:
- Linux
- python -m venv venv
- source venv/bin/activate
- Windows
- python -m venv venv
- venv\Scripts\activate

2. Install the required packages:
- ```bash
pip install -r requirements.txt
```
3. Usage

You can provide reference audio either as a Drive download link or as a direct audio file path to create more personalized (voice-cloned) speech output.
- voice with cloning
https://github.com/user-attachments/assets/fa4bc138-e17a-4e1c-a88a-8dd5f37e0db4


- output without cloning
https://github.com/user-attachments/assets/1bb4ee27-3730-4cca-85f0-f72629d7c640

Binary file not shown.
Binary file not shown.
Binary file added src/text2speech/Whisper_TTS/reference_audio.wav
Binary file not shown.
4 changes: 4 additions & 0 deletions src/text2speech/Whisper_TTS/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
torch
torchaudio
whisperspeech
requests
69 changes: 69 additions & 0 deletions src/text2speech/Whisper_TTS/whisper_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import torch
from whisperspeech.pipeline import Pipeline
import torchaudio
import os
import requests

def download_reference_audio(url, output_path, timeout=30):
    """Download a reference audio file and save it to disk.

    Args:
        url: HTTP(S) address the audio is fetched from.
        output_path: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        True if the file was downloaded and written, False on any network
        error, non-200 status code, or HTML response.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as bad
        # status codes: a message plus a False return, never a traceback.
        print(f"Failed to download the file: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return False

    # A sharing link can answer 200 with an HTML page (e.g. a confirmation
    # or sign-in page) instead of the file. Saving that would leave a bogus
    # audio file that later fails to load, so reject it here.
    content_type = response.headers.get("Content-Type", "")
    if content_type.lower().startswith("text/html"):
        print("Failed to download the file. Received an HTML page instead of audio.")
        return False

    with open(output_path, "wb") as file:
        file.write(response.content)
    print(f"File downloaded and saved to {output_path}")
    return True

def initialize_pipeline():
    """Create the WhisperSpeech ``Pipeline`` used for speech generation."""
    # Model reference passed as the pipeline's ``s2a_ref`` argument.
    s2a_model_ref = 'collabora/whisperspeech:s2a-q4-tiny-en+pl.model'
    return Pipeline(s2a_ref=s2a_model_ref)

def text_to_speech(pipe, text, reference_audio_path=None, cps=10.5, lang='en'):
    """Generate speech audio for ``text`` using ``pipe``.

    Args:
        pipe: Object exposing ``generate(text, lang=..., cps=..., speaker=...)``
            and returning a tensor (the WhisperSpeech pipeline in this file).
        text: The text to synthesise.
        reference_audio_path: Optional reference audio forwarded to the
            pipeline as ``speaker``; omitted from the call when falsy.
        cps: Forwarded unchanged to ``pipe.generate`` (presumably the speaking
            rate in characters per second - confirm against WhisperSpeech docs).
        lang: Language code forwarded to ``pipe.generate``. Defaults to
            ``'en'``, the value that was previously hard-coded.

    Returns:
        The generated audio as a CPU tensor with at least two dimensions.
    """
    generate_kwargs = {"lang": lang, "cps": cps}
    if reference_audio_path:
        generate_kwargs["speaker"] = reference_audio_path
    audio = pipe.generate(text, **generate_kwargs)

    # Drop singleton dimensions, then guarantee a 2-D result. ``reshape`` also
    # covers the 0-dim tensor that ``squeeze`` yields for a one-sample output,
    # which a plain ``dim() == 1`` check would let through unchanged.
    audio_cpu = audio.cpu().squeeze()
    if audio_cpu.dim() < 2:
        audio_cpu = audio_cpu.reshape(1, -1)

    return audio_cpu

def save_audio(audio, output_file, sample_rate=24000):
    """Write ``audio`` to ``output_file`` and print the path that was written.

    Args:
        audio: Audio tensor handed to ``torchaudio.save``.
        output_file: Destination path of the audio file.
        sample_rate: Sample rate passed to ``torchaudio.save``.
    """
    torchaudio.save(
        output_file,
        audio,
        sample_rate=sample_rate,
        encoding="PCM_F",
    )
    print("Generated audio file: {}".format(output_file))

def transcribe(text, output_file, use_voice_cloning=False, reference_audio_path=None, pipe=None):
    """Synthesise ``text`` to speech and save it to ``output_file``.

    Args:
        text: The text to convert to speech.
        output_file: Path the generated audio is written to.
        use_voice_cloning: When True, ``reference_audio_path`` is used as the
            speaker reference; when False the reference path is ignored.
        reference_audio_path: Path to an existing audio file; required when
            ``use_voice_cloning`` is True.
        pipe: Optional already-initialised pipeline to reuse. When None a new
            one is created via ``initialize_pipeline()``, so repeated calls
            can avoid rebuilding the pipeline each time.

    Returns:
        True when audio was generated and saved, False when voice cloning was
        requested without a usable reference audio file.
    """
    speaker_path = None
    if use_voice_cloning:
        if not reference_audio_path:
            print("Error: Voice cloning requested but no reference audio path provided.")
            return False
        # ``isfile`` rather than ``exists``: a directory is not usable as
        # reference audio and should be rejected here with a clear message.
        if not os.path.isfile(reference_audio_path):
            print(f"Error: Reference audio file not found at {reference_audio_path}")
            return False
        speaker_path = reference_audio_path

    if pipe is None:
        pipe = initialize_pipeline()

    audio = text_to_speech(pipe, text, speaker_path)

    save_audio(audio, output_file)

    return True

if __name__ == "__main__":
    # Demo: synthesise the same sentence once with the default voice and once
    # with a voice cloned from a reference recording.
    text_to_transcribe = "This is a test of the WhisperSpeech text-to-speech system with optional voice cloning."
    # You can replace this with a Drive download link to your own reference audio.
    reference_audio_url = "https://drive.google.com/uc?export=download&id=1P5kM5-U9tk3bdw309ybIbkQoxWw04YpM"
    reference_audio_path = "reference_audio.wav"

    # Download the reference audio only if it is not already present, and
    # remember whether a usable file is available afterwards.
    reference_available = os.path.exists(reference_audio_path) or download_reference_audio(
        reference_audio_url, reference_audio_path
    )

    # Without voice cloning
    transcribe(text_to_transcribe, "output_without_cloning.wav")

    # With voice cloning: reference_audio_path may also point to your own WAV file.
    if reference_available:
        transcribe(
            text_to_transcribe,
            "output_with_cloning.wav",
            use_voice_cloning=True,
            reference_audio_path=reference_audio_path,
        )
    else:
        print("Skipping voice cloning: reference audio could not be obtained.")